qbhf2's picture
added NvidiaWarp and GarmentCode repos
66c9c8a
# Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
# NVIDIA CORPORATION and its licensors retain all intellectual property
# and proprietary rights in and to this software, related documentation
# and any modifications thereto. Any use, reproduction, disclosure or
# distribution of this software and related documentation without an express
# license agreement from NVIDIA CORPORATION is strictly prohibited.
from __future__ import annotations
import builtins
import ctypes
import hashlib
import inspect
import struct
import zlib
from typing import Any, Callable, Generic, List, Tuple, TypeVar, Union
import numpy as np
import warp
# type hints
# Type variables used as generic placeholders in Warp type annotations.
Length = TypeVar("Length", bound=int)
Rows = TypeVar("Rows")
Cols = TypeVar("Cols")
DType = TypeVar("DType")
Int = TypeVar("Int")
Float = TypeVar("Float")
Scalar = TypeVar("Scalar")
# Subscripted ``Generic`` placeholders for the built-in math types.
Vector = Generic[Length, Scalar]
Matrix = Generic[Rows, Cols, Scalar]
Quaternion = Generic[Float]
Transformation = Generic[Float]
# NOTE: DType is declared a second time here with the same definition as
# above; ``Array`` below is parameterized with this second TypeVar object.
DType = TypeVar("DType")
Array = Generic[DType]
T = TypeVar("T")
# shared hash for all constants
_constant_hash = hashlib.sha256()


def constant(x):
    """Declare a compile-time constant that Warp kernels can access.

    The binary representation of the value is folded into the shared
    module-level ``_constant_hash`` before the value is handed back.

    Args:
        x: Compile-time constant value, can be any of the built-in math types.

    Returns:
        The value ``x``, unchanged.

    Raises:
        RuntimeError: If the type of ``x`` is not one of the handled types.
    """
    # pick the bytes-like payload that represents the constant
    if isinstance(x, builtins.bool):
        # must be tested before `int`: every Python bool is also an int
        payload = struct.pack("?", x)
    elif isinstance(x, int):
        payload = struct.pack("<q", x)
    elif isinstance(x, float):
        payload = struct.pack("<d", x)
    elif isinstance(x, float16):
        # float16 is a special case: its value is hashed as a C float
        payload = ctypes.pointer(ctypes.c_float(x.value)).contents
    elif isinstance(x, tuple(scalar_types)):
        payload = ctypes.pointer(x._type_(x.value)).contents
    elif isinstance(x, ctypes.Array):
        payload = bytes(x)
    else:
        raise RuntimeError(f"Invalid constant type: {type(x)}")

    _constant_hash.update(payload)
    return x
def float_to_half_bits(value):
    """Forward ``value`` to the Warp runtime's ``float_to_half_bits`` routine.

    Used by the vector/matrix types to turn a float into the uint16 bit
    pattern stored for ``float16`` data.
    """
    core = warp.context.runtime.core
    return core.float_to_half_bits(value)
def half_bits_to_float(value):
    """Forward ``value`` to the Warp runtime's ``half_bits_to_float`` routine.

    Used by the vector/matrix types to turn stored ``float16`` bits back
    into a float when elements are read.
    """
    core = warp.context.runtime.core
    return core.half_bits_to_float(value)
# ----------------------
# built-in types
def vector(length, dtype):
    """Create a Warp vector type with ``length`` components of type ``dtype``.

    Args:
        length: Number of components, or ``Any`` to declare a generic vector
            type (stored with ``_length_ == 0``).
        dtype: Scalar type of the components. Python ``int`` and ``float``
            are canonicalized to ``int32`` and ``float32``. For the generic
            ``Scalar`` / ``Float`` placeholders the storage type is
            ``ctypes.c_float``.

    Returns:
        A new ``ctypes.Array`` subclass (``vec_t``). Instances can be built
        from no arguments, a single scalar (broadcast to every component),
        a single sequence (expanded), or exactly ``length`` scalars.
    """
    # canonicalize dtype
    if dtype == int:
        dtype = int32
    elif dtype == float:
        dtype = float32

    class vec_t(ctypes.Array):
        # ctypes.Array data for length, shape and c type:
        _length_ = 0 if length is Any else length
        _shape_ = (_length_,)
        _type_ = ctypes.c_float if dtype in [Scalar, Float] else dtype._type_

        # warp scalar type:
        _wp_scalar_type_ = dtype
        _wp_type_params_ = [length, dtype]
        _wp_generic_type_str_ = "vec_t"
        _wp_constructor_ = "vector"

        # special handling for float16 type: in this case, data is stored
        # as uint16 but it's actually half precision floating point
        # data. This means we need to convert each of the arguments
        # to uint16s containing half float bits before storing them in
        # the array:
        scalar_import = float_to_half_bits if _wp_scalar_type_ == float16 else lambda x: x
        scalar_export = half_bits_to_float if _wp_scalar_type_ == float16 else lambda x: x

        def __init__(self, *args):
            """Initialize from nothing, one scalar, one sequence or all components.

            Raises:
                ValueError: If the number of arguments matches none of the
                    supported forms.
            """
            num_args = len(args)
            if num_args == 0:
                super().__init__()
            elif num_args == 1:
                if hasattr(args[0], "__len__"):
                    # try to copy from expanded sequence, e.g. (1, 2, 3)
                    self.__init__(*args[0])
                else:
                    # set all elements to the same value
                    value = vec_t.scalar_import(args[0])
                    for i in range(self._length_):
                        super().__setitem__(i, value)
            elif num_args == self._length_:
                # set all scalar elements
                for i in range(self._length_):
                    super().__setitem__(i, vec_t.scalar_import(args[i]))
            else:
                raise ValueError(
                    f"Invalid number of arguments in vector constructor, expected {self._length_} elements, got {num_args}"
                )

        def __getitem__(self, key):
            """Return one component (int key) or a list of components (slice key)."""
            if isinstance(key, int):
                return vec_t.scalar_export(super().__getitem__(key))
            elif isinstance(key, slice):
                if self._wp_scalar_type_ == float16:
                    return [vec_t.scalar_export(x) for x in super().__getitem__(key)]
                else:
                    return super().__getitem__(key)
            else:
                raise KeyError(f"Invalid key {key}, expected int or slice")

        def __setitem__(self, key, value):
            """Store one component (int key) or several components (slice key)."""
            if isinstance(key, int):
                super().__setitem__(key, vec_t.scalar_import(value))
                return value
            elif isinstance(key, slice):
                if self._wp_scalar_type_ == float16:
                    super().__setitem__(key, [vec_t.scalar_import(x) for x in value])
                    return value
                else:
                    return super().__setitem__(key, value)
            else:
                raise KeyError(f"Invalid key {key}, expected int or slice")

        def __getattr__(self, name):
            """Resolve the component aliases ``x``, ``y``, ``z`` and ``w``."""
            # Only exact one-character names are aliases. str.find() on its
            # own also matches substrings such as "xy" or "yzw", which would
            # silently return a single component for those names.
            idx = "xyzw".find(name) if len(name) == 1 else -1
            if idx != -1:
                return self.__getitem__(idx)

            return self.__getattribute__(name)

        def __setattr__(self, name, value):
            """Assign through the component aliases ``x``, ``y``, ``z`` and ``w``."""
            # same exact-name rule as __getattr__
            idx = "xyzw".find(name) if len(name) == 1 else -1
            if idx != -1:
                return self.__setitem__(idx, value)

            return super().__setattr__(name, value)

        # arithmetic operators forward to the corresponding warp functions
        def __add__(self, y):
            return warp.add(self, y)

        def __radd__(self, y):
            return warp.add(y, self)

        def __sub__(self, y):
            return warp.sub(self, y)

        def __rsub__(self, y):
            return warp.sub(y, self)

        def __mul__(self, y):
            return warp.mul(self, y)

        def __rmul__(self, x):
            return warp.mul(x, self)

        def __truediv__(self, y):
            return warp.div(self, y)

        def __rtruediv__(self, x):
            return warp.div(x, self)

        def __pos__(self):
            return warp.pos(self)

        def __neg__(self):
            return warp.neg(self)

        def __str__(self):
            return f"[{', '.join(map(str, self))}]"

        def __eq__(self, other):
            """Compare component-wise against any indexable ``other``."""
            for i in range(self._length_):
                if self[i] != other[i]:
                    return False
            return True

        @classmethod
        def from_ptr(cls, ptr):
            """Build a vector by copying raw component data from ``ptr``.

            Raises:
                RuntimeError: If ``ptr`` is falsy (NULL).
            """
            if ptr:
                # create a new vector instance and initialize the contents from the binary data
                # this skips float16 conversions, assuming that float16 data is already encoded as uint16
                value = cls()
                ctypes.memmove(ctypes.byref(value), ptr, ctypes.sizeof(cls._type_) * cls._length_)
                return value
            else:
                raise RuntimeError("NULL pointer exception")

    return vec_t
def matrix(shape, dtype):
    """Create a Warp matrix type with the given 2D ``shape`` and scalar ``dtype``.

    Args:
        shape: Pair ``(rows, cols)``; if either entry is ``Any`` the type is
            generic and stored with ``_length_ == 0`` and ``_shape_ == (0, 0)``.
        dtype: Scalar type of the elements. Python ``int`` and ``float`` are
            canonicalized to ``int32`` and ``float32``.

    Returns:
        A new ``ctypes.Array`` subclass (``mat_t``) storing its elements
        row by row.
    """
    assert len(shape) == 2

    # canonicalize dtype
    if dtype == int:
        dtype = int32
    elif dtype == float:
        dtype = float32

    class mat_t(ctypes.Array):
        _length_ = 0 if shape[0] == Any or shape[1] == Any else shape[0] * shape[1]
        _shape_ = (0, 0) if _length_ == 0 else shape
        _type_ = ctypes.c_float if dtype in [Scalar, Float] else dtype._type_

        # warp scalar type:
        # used in type checking and when writing out c++ code for constructors:
        _wp_scalar_type_ = dtype
        _wp_type_params_ = [shape[0], shape[1], dtype]
        _wp_generic_type_str_ = "mat_t"
        _wp_constructor_ = "matrix"

        _wp_row_type_ = vector(0 if shape[1] == Any else shape[1], dtype)

        # float16 data is stored as uint16 holding half-precision bits, so
        # values are converted on the way in and on the way out; every other
        # scalar type passes through untouched:
        scalar_import = float_to_half_bits if _wp_scalar_type_ == float16 else lambda x: x
        scalar_export = half_bits_to_float if _wp_scalar_type_ == float16 else lambda x: x

        def __init__(self, *args):
            """Initialize from nothing, one scalar, one sequence, all elements or rows."""
            argc = len(args)

            if argc == 0:
                super().__init__()
                return

            if argc == 1:
                single = args[0]
                if hasattr(single, "__len__"):
                    # try to copy from expanded sequence, e.g. [[1, 0], [0, 1]]
                    self.__init__(*single)
                    return
                # set all elements to the same value
                fill = mat_t.scalar_import(single)
                for flat in range(self._length_):
                    super().__setitem__(flat, fill)
                return

            if argc == self._length_:
                # one scalar per element, in storage order
                for flat in range(self._length_):
                    super().__setitem__(flat, mat_t.scalar_import(args[flat]))
                return

            if argc == self._shape_[0]:
                # one sequence per row
                for r, row in enumerate(args):
                    if not hasattr(row, "__len__") or len(row) != self._shape_[1]:
                        raise TypeError(
                            f"Invalid argument in matrix constructor, expected row of length {self._shape_[1]}, got {row}"
                        )
                    base = r * self._shape_[1]
                    for c in range(self._shape_[1]):
                        super().__setitem__(base + c, mat_t.scalar_import(row[c]))
                return

            raise ValueError(
                f"Invalid number of arguments in matrix constructor, expected {self._length_} elements, got {argc}"
            )

        # arithmetic operators forward to the corresponding warp functions
        def __add__(self, y):
            return warp.add(self, y)

        def __radd__(self, y):
            return warp.add(y, self)

        def __sub__(self, y):
            return warp.sub(self, y)

        def __rsub__(self, y):
            return warp.sub(y, self)

        def __mul__(self, y):
            return warp.mul(self, y)

        def __rmul__(self, x):
            return warp.mul(x, self)

        def __matmul__(self, y):
            return warp.mul(self, y)

        def __rmatmul__(self, x):
            return warp.mul(x, self)

        def __truediv__(self, y):
            return warp.div(self, y)

        def __rtruediv__(self, x):
            return warp.div(x, self)

        def __pos__(self):
            return warp.pos(self)

        def __neg__(self):
            return warp.neg(self)

        def __str__(self):
            rows = [f"[{', '.join(map(str, self.get_row(r)))}]" for r in range(self._shape_[0])]
            return "[" + ",\n ".join(rows) + "]"

        def __eq__(self, other):
            """Compare element-wise against any doubly-indexable ``other``."""
            return not any(
                self[i][j] != other[i][j]
                for i in range(self._shape_[0])
                for j in range(self._shape_[1])
            )

        def get_row(self, r):
            """Return row ``r`` as an instance of the matrix's row vector type."""
            if r < 0 or r >= self._shape_[0]:
                raise IndexError("Invalid row index")

            first = r * self._shape_[1]
            last = first + self._shape_[1]
            raw = super().__getitem__(slice(first, last))

            if self._wp_scalar_type_ == float16:
                return self._wp_row_type_(*[mat_t.scalar_export(x) for x in raw])
            return self._wp_row_type_(raw)

        def set_row(self, r, v):
            """Overwrite row ``r`` with the values in ``v``."""
            if r < 0 or r >= self._shape_[0]:
                raise IndexError("Invalid row index")

            first = r * self._shape_[1]
            last = first + self._shape_[1]

            if self._wp_scalar_type_ == float16:
                v = [mat_t.scalar_import(x) for x in v]
            super().__setitem__(slice(first, last), v)

        def __getitem__(self, key):
            """Return element ``m[i, j]`` (tuple key) or row ``m[r]`` (int key)."""
            if isinstance(key, tuple):
                # element indexing m[i,j]
                if len(key) != 2:
                    raise KeyError(f"Invalid key, expected one or two indices, got {len(key)}")
                flat = key[0] * self._shape_[1] + key[1]
                return mat_t.scalar_export(super().__getitem__(flat))

            if isinstance(key, int):
                # row vector indexing m[r]
                return self.get_row(key)

            raise KeyError(f"Invalid key {key}, expected int or pair of ints")

        def __setitem__(self, key, value):
            """Assign element ``m[i, j] = x`` (tuple key) or row ``m[r] = v`` (int key)."""
            if isinstance(key, tuple):
                # element indexing m[i,j] = x
                if len(key) != 2:
                    raise KeyError(f"Invalid key, expected one or two indices, got {len(key)}")
                flat = key[0] * self._shape_[1] + key[1]
                super().__setitem__(flat, mat_t.scalar_import(value))
                return value

            if isinstance(key, int):
                # row vector indexing m[r] = v
                self.set_row(key, value)
                return value

            raise KeyError(f"Invalid key {key}, expected int or pair of ints")

        @classmethod
        def from_ptr(cls, ptr):
            """Build a matrix by copying raw element data from ``ptr``."""
            if not ptr:
                raise RuntimeError("NULL pointer exception")

            # the copy skips float16 conversions, assuming that float16 data
            # is already encoded as uint16
            value = cls()
            ctypes.memmove(ctypes.byref(value), ptr, ctypes.sizeof(cls._type_) * cls._length_)
            return value

    return mat_t
class void:
    """Marker type standing in for "no value"; instances carry no state."""

    def __init__(self):
        """Create the marker object."""
class bool:
    """Warp boolean scalar backed by ``ctypes.c_bool``.

    This class shadows the Python builtin inside this module; code that
    needs the builtin refers to ``builtins.bool`` explicitly.
    """

    _length_ = 1
    _type_ = ctypes.c_bool

    def __init__(self, x=False):
        # the value is stored as given, without conversion
        self.value = x
class float16:
    """Warp half-precision scalar; its storage type is ``ctypes.c_uint16``."""

    _length_ = 1
    _type_ = ctypes.c_uint16

    def __init__(self, x=0.0):
        # the value is stored as given, without conversion
        self.value = x
class float32:
    """Warp single-precision scalar backed by ``ctypes.c_float``."""

    _length_ = 1
    _type_ = ctypes.c_float

    def __init__(self, x=0.0):
        # the value is stored as given, without conversion
        self.value = x
class float64:
    """Warp double-precision scalar backed by ``ctypes.c_double``."""

    _length_ = 1
    _type_ = ctypes.c_double

    def __init__(self, x=0.0):
        # the value is stored as given, without conversion
        self.value = x
class int8:
    """Warp signed 8-bit integer scalar backed by ``ctypes.c_int8``."""

    _length_ = 1
    _type_ = ctypes.c_int8

    def __init__(self, x=0):
        # the value is stored as given, without conversion
        self.value = x
class uint8:
    """Warp unsigned 8-bit integer scalar backed by ``ctypes.c_uint8``."""

    _length_ = 1
    _type_ = ctypes.c_uint8

    def __init__(self, x=0):
        # the value is stored as given, without conversion
        self.value = x
class int16:
    """Warp signed 16-bit integer scalar backed by ``ctypes.c_int16``."""

    _length_ = 1
    _type_ = ctypes.c_int16

    def __init__(self, x=0):
        # the value is stored as given, without conversion
        self.value = x
class uint16:
    """Warp unsigned 16-bit integer scalar backed by ``ctypes.c_uint16``."""

    _length_ = 1
    _type_ = ctypes.c_uint16

    def __init__(self, x=0):
        # the value is stored as given, without conversion
        self.value = x
class int32:
    """Warp signed 32-bit integer scalar backed by ``ctypes.c_int32``."""

    _length_ = 1
    _type_ = ctypes.c_int32

    def __init__(self, x=0):
        # the value is stored as given, without conversion
        self.value = x
class uint32:
    """Warp unsigned 32-bit integer scalar backed by ``ctypes.c_uint32``."""

    _length_ = 1
    _type_ = ctypes.c_uint32

    def __init__(self, x=0):
        # the value is stored as given, without conversion
        self.value = x
class int64:
    """Warp signed 64-bit integer scalar backed by ``ctypes.c_int64``."""

    _length_ = 1
    _type_ = ctypes.c_int64

    def __init__(self, x=0):
        # the value is stored as given, without conversion
        self.value = x
class uint64:
    """Warp unsigned 64-bit integer scalar backed by ``ctypes.c_uint64``."""

    _length_ = 1
    _type_ = ctypes.c_uint64

    def __init__(self, x=0):
        # the value is stored as given, without conversion
        self.value = x
def quaternion(dtype=Any):
    """Create a quaternion type: a 4-component vector of ``dtype``.

    The returned class derives from ``vector(length=4, dtype=dtype)`` and
    replaces the vector's Warp type metadata with quaternion-specific values.
    """

    class quat_t(vector(length=4, dtype=dtype)):
        # quaternion-specific type metadata
        _wp_type_params_ = [dtype]
        _wp_generic_type_str_ = "quat_t"
        _wp_constructor_ = "quaternion"

    return quat_t
class quath(quaternion(dtype=float16)):
    """Quaternion with float16 components."""


class quatf(quaternion(dtype=float32)):
    """Quaternion with float32 components."""


class quatd(quaternion(dtype=float64)):
    """Quaternion with float64 components."""
def transformation(dtype=Any):
    """Create a transform type: a 7-component vector of ``dtype``.

    Components 0-2 hold the ``p`` part and components 3-6 hold the ``q``
    part of the transform.

    Args:
        dtype: Scalar type of the components.

    Returns:
        A new class derived from ``vector(length=7, dtype=dtype)``.
    """

    class transform_t(vector(length=7, dtype=dtype)):
        # signature of the original "from components" constructor, used to
        # recognize calls such as transform(p, q), transform(p) or transform(q=...)
        _wp_init_from_components_sig_ = inspect.Signature(
            (
                inspect.Parameter(
                    "p",
                    inspect.Parameter.POSITIONAL_OR_KEYWORD,
                    default=(0.0, 0.0, 0.0),
                ),
                inspect.Parameter(
                    "q",
                    inspect.Parameter.POSITIONAL_OR_KEYWORD,
                    default=(0.0, 0.0, 0.0, 1.0),
                ),
            ),
        )
        _wp_type_params_ = [dtype]
        _wp_generic_type_str_ = "transform_t"
        _wp_constructor_ = "transformation"

        def __init__(self, *args, **kwargs):
            """Initialize from another transform, from ``p``/``q`` or like a vector."""
            if len(args) == 1 and len(kwargs) == 0:
                # Use a default so that plain sequences and scalars, which
                # have no Warp type tag, continue to the paths below instead
                # of raising AttributeError here.
                if getattr(args[0], "_wp_generic_type_str_", None) == self._wp_generic_type_str_:
                    # Copy constructor.
                    super().__init__(*args[0])
                    return

            try:
                # For backward compatibility, try to check if the arguments
                # match the original signature that'd allow initializing
                # the `p` and `q` components separately.
                bound_args = self._wp_init_from_components_sig_.bind(*args, **kwargs)
                bound_args.apply_defaults()
                p, q = bound_args.args
            except (TypeError, ValueError):
                # Fallback to the vector's constructor.
                super().__init__(*args)
                return

            # Even if the arguments match the original "from components"
            # signature, we still need to make sure that they represent
            # sequences that can be unpacked.
            if hasattr(p, "__len__") and hasattr(q, "__len__"):
                # Initialize from the `p` and `q` components.
                super().__init__()
                self[0:3] = vector(length=3, dtype=dtype)(*p)
                self[3:7] = quaternion(dtype=dtype)(*q)
                return

            # Fallback to the vector's constructor.
            super().__init__(*args)

        # NOTE(review): both properties build the 32-bit alias types
        # (`vec3` / `quat`) whatever `dtype` is - confirm this is intended
        # for the float16 and float64 variants.
        @property
        def p(self):
            """Components 0-2 wrapped in a ``vec3``."""
            return vec3(self[0:3])

        @property
        def q(self):
            """Components 3-6 wrapped in a ``quat``."""
            return quat(self[3:7])

    return transform_t
class transformh(transformation(dtype=float16)):
    """Transform with float16 components."""


class transformf(transformation(dtype=float32)):
    """Transform with float32 components."""


class transformd(transformation(dtype=float64)):
    """Transform with float64 components."""
class vec2h(vector(length=2, dtype=float16)):
    """2-component vector of float16."""


class vec3h(vector(length=3, dtype=float16)):
    """3-component vector of float16."""


class vec4h(vector(length=4, dtype=float16)):
    """4-component vector of float16."""


class vec2f(vector(length=2, dtype=float32)):
    """2-component vector of float32."""


class vec3f(vector(length=3, dtype=float32)):
    """3-component vector of float32."""


class vec4f(vector(length=4, dtype=float32)):
    """4-component vector of float32."""


class vec2d(vector(length=2, dtype=float64)):
    """2-component vector of float64."""


class vec3d(vector(length=3, dtype=float64)):
    """3-component vector of float64."""


class vec4d(vector(length=4, dtype=float64)):
    """4-component vector of float64."""


class vec2b(vector(length=2, dtype=int8)):
    """2-component vector of int8."""


class vec3b(vector(length=3, dtype=int8)):
    """3-component vector of int8."""


class vec4b(vector(length=4, dtype=int8)):
    """4-component vector of int8."""


class vec2ub(vector(length=2, dtype=uint8)):
    """2-component vector of uint8."""


class vec3ub(vector(length=3, dtype=uint8)):
    """3-component vector of uint8."""


class vec4ub(vector(length=4, dtype=uint8)):
    """4-component vector of uint8."""


class vec2s(vector(length=2, dtype=int16)):
    """2-component vector of int16."""


class vec3s(vector(length=3, dtype=int16)):
    """3-component vector of int16."""


class vec4s(vector(length=4, dtype=int16)):
    """4-component vector of int16."""


class vec2us(vector(length=2, dtype=uint16)):
    """2-component vector of uint16."""


class vec3us(vector(length=3, dtype=uint16)):
    """3-component vector of uint16."""


class vec4us(vector(length=4, dtype=uint16)):
    """4-component vector of uint16."""


class vec2i(vector(length=2, dtype=int32)):
    """2-component vector of int32."""


class vec3i(vector(length=3, dtype=int32)):
    """3-component vector of int32."""


class vec4i(vector(length=4, dtype=int32)):
    """4-component vector of int32."""


class vec2ui(vector(length=2, dtype=uint32)):
    """2-component vector of uint32."""


class vec3ui(vector(length=3, dtype=uint32)):
    """3-component vector of uint32."""


class vec4ui(vector(length=4, dtype=uint32)):
    """4-component vector of uint32."""


class vec2l(vector(length=2, dtype=int64)):
    """2-component vector of int64."""


class vec3l(vector(length=3, dtype=int64)):
    """3-component vector of int64."""


class vec4l(vector(length=4, dtype=int64)):
    """4-component vector of int64."""


class vec2ul(vector(length=2, dtype=uint64)):
    """2-component vector of uint64."""


class vec3ul(vector(length=3, dtype=uint64)):
    """3-component vector of uint64."""


class vec4ul(vector(length=4, dtype=uint64)):
    """4-component vector of uint64."""
class mat22h(matrix(shape=(2, 2), dtype=float16)):
    """2x2 matrix of float16."""


class mat33h(matrix(shape=(3, 3), dtype=float16)):
    """3x3 matrix of float16."""


class mat44h(matrix(shape=(4, 4), dtype=float16)):
    """4x4 matrix of float16."""


class mat22f(matrix(shape=(2, 2), dtype=float32)):
    """2x2 matrix of float32."""


class mat33f(matrix(shape=(3, 3), dtype=float32)):
    """3x3 matrix of float32."""


class mat44f(matrix(shape=(4, 4), dtype=float32)):
    """4x4 matrix of float32."""


class mat22d(matrix(shape=(2, 2), dtype=float64)):
    """2x2 matrix of float64."""


class mat33d(matrix(shape=(3, 3), dtype=float64)):
    """3x3 matrix of float64."""


class mat44d(matrix(shape=(4, 4), dtype=float64)):
    """4x4 matrix of float64."""
class spatial_vectorh(vector(length=6, dtype=float16)):
    """6-component spatial vector of float16."""


class spatial_vectorf(vector(length=6, dtype=float32)):
    """6-component spatial vector of float32."""


class spatial_vectord(vector(length=6, dtype=float64)):
    """6-component spatial vector of float64."""


class spatial_matrixh(matrix(shape=(6, 6), dtype=float16)):
    """6x6 spatial matrix of float16."""


class spatial_matrixf(matrix(shape=(6, 6), dtype=float32)):
    """6x6 spatial matrix of float32."""


class spatial_matrixd(matrix(shape=(6, 6), dtype=float64)):
    """6x6 spatial matrix of float64."""
# Built-in type aliases without a precision suffix; each one is bound to
# the float32 (`f`) variant, i.e. they default to 32-bit precision.
vec2 = vec2f
vec3 = vec3f
vec4 = vec4f
mat22 = mat22f
mat33 = mat33f
mat44 = mat44f
quat = quatf
transform = transformf
spatial_vector = spatial_vectorf
spatial_matrix = spatial_matrixf
# Scalar type groups; `scalar_types` is the integer list followed by the
# float list.
int_types = [int8, uint8, int16, uint16, int32, uint32, int64, uint64]
float_types = [float16, float32, float64]
scalar_types = int_types + float_types
# Concrete built-in composite types. Despite the name, the list also holds
# the matrix, quaternion, transform and spatial types declared above.
vector_types = [
    vec2b,
    vec2ub,
    vec2s,
    vec2us,
    vec2i,
    vec2ui,
    vec2l,
    vec2ul,
    vec2h,
    vec2f,
    vec2d,
    vec3b,
    vec3ub,
    vec3s,
    vec3us,
    vec3i,
    vec3ui,
    vec3l,
    vec3ul,
    vec3h,
    vec3f,
    vec3d,
    vec4b,
    vec4ub,
    vec4s,
    vec4us,
    vec4i,
    vec4ui,
    vec4l,
    vec4ul,
    vec4h,
    vec4f,
    vec4d,
    mat22h,
    mat22f,
    mat22d,
    mat33h,
    mat33f,
    mat33d,
    mat44h,
    mat44f,
    mat44d,
    quath,
    quatf,
    quatd,
    transformh,
    transformf,
    transformd,
    spatial_vectorh,
    spatial_vectorf,
    spatial_vectord,
    spatial_matrixh,
    spatial_matrixf,
    spatial_matrixd,
]
# Lookup from NumPy dtype objects to Warp scalar types; consulted when an
# array's dtype has to be inferred from the data it is constructed from.
np_dtype_to_warp_type = {
    np.dtype(np.bool_): bool,
    np.dtype(np.int8): int8,
    np.dtype(np.uint8): uint8,
    np.dtype(np.int16): int16,
    np.dtype(np.uint16): uint16,
    np.dtype(np.int32): int32,
    np.dtype(np.int64): int64,
    np.dtype(np.uint32): uint32,
    np.dtype(np.uint64): uint64,
    # np.byte / np.ubyte map to the same Warp types as np.int8 / np.uint8
    np.dtype(np.byte): int8,
    np.dtype(np.ubyte): uint8,
    np.dtype(np.float16): float16,
    np.dtype(np.float32): float32,
    np.dtype(np.float64): float64,
}
# Lookup from Warp scalar types to NumPy scalar types; consulted when input
# data has to be converted to an array of a requested Warp dtype.
warp_type_to_np_dtype = {
    bool: np.bool_,
    int8: np.int8,
    int16: np.int16,
    int32: np.int32,
    int64: np.int64,
    uint8: np.uint8,
    uint16: np.uint16,
    uint32: np.uint32,
    uint64: np.uint64,
    float16: np.float16,
    float32: np.float32,
    float64: np.float64,
}
# represent a Python range iterator
class range_t:
    """Marker type representing a Python range iterator; holds no state."""

    def __init__(self):
        """Create the marker object."""
# definition just for kernel type (cannot be a parameter), see bvh.h
class bvh_query_t:
    """Object used to track state during BVH traversal."""

    def __init__(self):
        """Create the object; the Python-side instance holds no state."""
# definition just for kernel type (cannot be a parameter), see mesh.h
class mesh_query_aabb_t:
    """Object used to track state during mesh traversal."""

    def __init__(self):
        """Create the object; the Python-side instance holds no state."""
# definition just for kernel type (cannot be a parameter), see hash_grid.h
class hash_grid_query_t:
    """Object used to track state during neighbor traversal."""

    def __init__(self):
        """Create the object; the Python-side instance holds no state."""
# maximum number of dimensions, must match array.h
# ARRAY_MAX_DIMS sizes the shape/stride members of the array structures
# below; LAUNCH_MAX_DIMS sizes the shape member of launch_bounds_t.
ARRAY_MAX_DIMS = 4
LAUNCH_MAX_DIMS = 4
# Array kind identifiers; must match array.h
ARRAY_TYPE_REGULAR = 0
ARRAY_TYPE_INDEXED = 1
ARRAY_TYPE_FABRIC = 2
ARRAY_TYPE_FABRIC_INDEXED = 3
# represents bounds for kernel launch (number of threads across multiple dimensions)
class launch_bounds_t(ctypes.Structure):
    """Kernel launch bounds: thread counts per dimension plus their product.

    Args:
        shape: Either a single ``int`` (1D launch) or a sequence of per-dimension
            thread counts.
    """

    _fields_ = [("shape", ctypes.c_int32 * LAUNCH_MAX_DIMS), ("ndim", ctypes.c_int32), ("size", ctypes.c_size_t)]

    def __init__(self, shape):
        if isinstance(shape, int):
            # 1d launch: the single dimension is also the total size
            self.ndim = 1
            self.size = shape
            self.shape[0] = shape
        else:
            # nd launch: total size is the product of all dimensions
            self.ndim = len(shape)
            self.size = 1
            for axis in range(self.ndim):
                extent = shape[axis]
                self.shape[axis] = extent
                self.size = self.size * extent

        # initialize the remaining dims to 1
        for axis in range(self.ndim, LAUNCH_MAX_DIMS):
            self.shape[axis] = 1
class shape_t(ctypes.Structure):
    """ctypes structure holding ``ARRAY_MAX_DIMS`` 32-bit dimension sizes."""

    _fields_ = [("dims", ctypes.c_int32 * ARRAY_MAX_DIMS)]

    def __init__(self):
        """Take no arguments; no field is assigned here."""
class array_t(ctypes.Structure):
    """ctypes description of an array: data/grad addresses, shape, strides, ndim."""

    _fields_ = [
        ("data", ctypes.c_uint64),
        ("grad", ctypes.c_uint64),
        ("shape", ctypes.c_int32 * ARRAY_MAX_DIMS),
        ("strides", ctypes.c_int32 * ARRAY_MAX_DIMS),
        ("ndim", ctypes.c_int32),
    ]

    def __init__(self, data=0, grad=0, ndim=0, shape=(0,), strides=(0,)):
        """Fill the structure; only the first ``ndim`` shape/stride entries are written."""
        self.data = data
        self.grad = grad
        self.ndim = ndim
        for axis in range(ndim):
            self.shape[axis] = shape[axis]
            self.strides[axis] = strides[axis]

    @classmethod
    def numpy_dtype(cls):
        """Structured type description used when array_t is packed in a
        struct and shared via numpy structured array."""
        return cls._numpy_dtype_

    def numpy_value(self):
        """Structured value used when array_t is packed in a struct and
        shared via a numpy structured array."""
        shape_list = list(self.shape)
        stride_list = list(self.strides)
        return (self.data, self.grad, shape_list, stride_list, self.ndim)
# NOTE: must match array_t._fields_
# NumPy structured dtype description mirroring array_t's ctypes layout:
# the offsets and the item size are read from the ctypes structure itself.
array_t._numpy_dtype_ = {
    "names": ["data", "grad", "shape", "strides", "ndim"],
    "formats": ["u8", "u8", f"{ARRAY_MAX_DIMS}i4", f"{ARRAY_MAX_DIMS}i4", "i4"],
    "offsets": [
        array_t.data.offset,
        array_t.grad.offset,
        array_t.shape.offset,
        array_t.strides.offset,
        array_t.ndim.offset,
    ],
    "itemsize": ctypes.sizeof(array_t),
}
class indexedarray_t(ctypes.Structure):
    """ctypes description of an indexed array.

    Args:
        data: Source array object providing ``__ctype__()`` and ``ndim``,
            or ``None`` for an empty description.
        indices: Per-dimension index arrays (objects with a ``ptr``
            attribute) or ``None`` entries.
        shape: Per-dimension sizes.
    """

    _fields_ = [
        ("data", array_t),
        ("indices", ctypes.c_void_p * ARRAY_MAX_DIMS),
        ("shape", ctypes.c_int32 * ARRAY_MAX_DIMS),
    ]

    def __init__(self, data, indices, shape):
        if data is None:
            # empty description: default array, NULL indices, zero shape
            self.data = array().__ctype__()
            for dim in range(ARRAY_MAX_DIMS):
                self.indices[dim] = ctypes.c_void_p(None)
                self.shape[dim] = 0
            return

        self.data = data.__ctype__()
        for dim in range(data.ndim):
            index_array = indices[dim]
            # a missing index array is stored as a NULL pointer
            address = None if index_array is None else index_array.ptr
            self.indices[dim] = ctypes.c_void_p(address)
            self.shape[dim] = shape[dim]
def type_ctype(dtype):
    """Return the ctypes type used to store a value of ``dtype``.

    Python ``float`` and ``int`` map to ``ctypes.c_float`` and
    ``ctypes.c_int32``; any other type reports its own ``_type_``.
    """
    builtin_ctypes = ((float, ctypes.c_float), (int, ctypes.c_int32))
    for py_type, c_type in builtin_ctypes:
        if dtype == py_type:
            return c_type

    # scalar type
    return dtype._type_
def type_length(dtype):
    """Return the number of scalar components in ``dtype``.

    Python ``float`` / ``int`` and struct types count as one component;
    every other type reports its ``_length_``.
    """
    is_single = dtype == float or dtype == int or isinstance(dtype, warp.codegen.Struct)
    return 1 if is_single else dtype._length_
def type_scalar_type(dtype):
    """Return ``dtype._wp_scalar_type_`` when present, otherwise ``dtype`` itself."""
    try:
        return dtype._wp_scalar_type_
    except AttributeError:
        return dtype
def type_size_in_bytes(dtype):
    """Return the storage size of ``dtype`` in bytes, or 0 if it is unknown."""
    if dtype.__module__ == "ctypes":
        # a ctypes type: ask ctypes directly
        return ctypes.sizeof(dtype)

    if isinstance(dtype, warp.codegen.Struct):
        return ctypes.sizeof(dtype.ctype)

    if dtype == float or dtype == int:
        # Python float / int are treated as 4-byte values
        return 4

    if hasattr(dtype, "_type_"):
        component_count = getattr(dtype, "_length_", 1)
        return component_count * ctypes.sizeof(dtype._type_)

    return 0
def type_to_warp(dtype):
    """Map Python ``float`` / ``int`` to ``float32`` / ``int32``; pass others through."""
    if dtype == float:
        return float32
    if dtype == int:
        return int32
    return dtype
def type_typestr(dtype):
    """Return the array-interface type string for ``dtype``.

    Scalars map to fixed codes, struct types to an opaque ``|V<size>``
    code, and ``ctypes.Array`` based types to the code of their scalar type.

    Raises:
        Exception: If ``dtype`` is none of the above.
    """
    # checked in order; the first equal entry wins
    scalar_codes = (
        (bool, "?"),
        (float16, "<f2"),
        (float32, "<f4"),
        (float64, "<f8"),
        (int8, "b"),
        (uint8, "B"),
        (int16, "<i2"),
        (uint16, "<u2"),
        (int32, "<i4"),
        (uint32, "<u4"),
        (int64, "<i8"),
        (uint64, "<u8"),
    )
    for scalar, code in scalar_codes:
        if dtype == scalar:
            return code

    if isinstance(dtype, warp.codegen.Struct):
        return f"|V{ctypes.sizeof(dtype.ctype)}"

    if issubclass(dtype, ctypes.Array):
        return type_typestr(dtype._wp_scalar_type_)

    raise Exception("Unknown ctype")
# converts any known type to a human readable string, good for error messages, reporting etc
def type_repr(t):
    """Convert any known type to a human-readable string.

    Intended for error messages and reporting.
    """
    if is_array(t):
        return f"array(ndim={t.ndim}, dtype={t.dtype})"

    if type_is_vector(t):
        return f"vector(length={t._shape_[0]}, dtype={t._wp_scalar_type_})"

    if type_is_matrix(t):
        return f"matrix(shape=({t._shape_[0]}, {t._shape_[1]}), dtype={t._wp_scalar_type_})"

    if isinstance(t, warp.codegen.Struct):
        # describe a struct through the class it wraps
        return type_repr(t.cls)

    if t in scalar_types:
        return t.__name__

    try:
        qualified = t.__module__ + "." + t.__qualname__
    except AttributeError:
        return str(t)
    return qualified
def type_is_int(t):
    """Return True if type ``t`` is Python ``int`` or one of ``int_types``."""
    canonical = int32 if t == int else t
    return canonical in int_types
def type_is_float(t):
    """Return True if type ``t`` is Python ``float`` or one of ``float_types``."""
    canonical = float32 if t == float else t
    return canonical in float_types
# returns True if the passed *type* is a vector
def type_is_vector(t):
    """Return True if the passed *type* carries the ``"vec_t"`` type tag."""
    tag = getattr(t, "_wp_generic_type_str_", None)
    return True if tag == "vec_t" else False
# returns True if the passed *type* is a matrix
def type_is_matrix(t):
    """Return True if the passed *type* carries the ``"mat_t"`` type tag."""
    tag = getattr(t, "_wp_generic_type_str_", None)
    return True if tag == "mat_t" else False
# returns true for all value types (int, float, bool, scalars, vectors, matrices)
def type_is_value(x):
    """Return True for all value types.

    Value types are Python ``int`` / ``float`` / ``bool``, the Warp scalar
    types, and anything derived from ``ctypes.Array`` (vectors, matrices).
    """
    if x == int or x == float or x == builtins.bool:
        return True
    if x in scalar_types:
        return True
    return issubclass(x, ctypes.Array)
# equivalent of the above but for values
def is_int(x):
    """Value counterpart of ``type_is_int``: test the type of instance ``x``."""
    value_type = type(x)
    return type_is_int(value_type)
def is_float(x):
    """Value counterpart of ``type_is_float``: test the type of instance ``x``."""
    value_type = type(x)
    return type_is_float(value_type)
def is_value(x):
    """Value counterpart of ``type_is_value``: test the type of instance ``x``."""
    value_type = type(x)
    return type_is_value(value_type)
# returns true if the passed *instance* is one of the array types
def is_array(a):
    """Return True if the passed *instance* is one of the array types."""
    matches = isinstance(a, array_types)
    return matches
def types_equal(a, b, match_generic=False):
    """Return True if the types ``a`` and ``b`` are considered equal.

    Python ``float`` / ``int`` are treated as ``float32`` / ``int32``, and
    the Python builtin ``bool`` is compatible with the Warp ``bool``.
    Types carrying the same ``_wp_generic_type_str_`` are compared through
    their ``_wp_type_params_``, pair by pair.

    Args:
        a: First type (or array instance).
        b: Second type (or array instance).
        match_generic: When True, ``Any`` matches everything, ``Scalar``
            matches any scalar type and ``Float`` matches any float type.

    Returns:
        True if the types are equal under the rules above, otherwise False.
    """

    def canonicalize(t):
        # map the Python builtins onto their default Warp scalar types
        if t == float:
            return float32
        if t == int:
            return int32
        return t

    a = canonicalize(a)
    b = canonicalize(b)

    compatible_bool_types = [builtins.bool, bool]

    def are_equal(p1, p2):
        if match_generic:
            if p1 == Any or p2 == Any:
                return True
            if p1 == Scalar and p2 in scalar_types:
                return True
            if p2 == Scalar and p1 in scalar_types:
                return True
            if p1 == Scalar and p2 == Scalar:
                return True
            if p1 == Float and p2 in float_types:
                return True
            if p2 == Float and p1 in float_types:
                return True
            if p1 == Float and p2 == Float:
                return True

        # Canonicalize both sides. The previous code tested the outer `b`
        # instead of `p2`, so a Python `int` on the right-hand side was
        # never turned into `int32`.
        p1 = canonicalize(p1)
        p2 = canonicalize(p2)

        if p1 in compatible_bool_types and p2 in compatible_bool_types:
            return True
        return p1 == p2

    if (
        hasattr(a, "_wp_generic_type_str_")
        and hasattr(b, "_wp_generic_type_str_")
        and a._wp_generic_type_str_ == b._wp_generic_type_str_
    ):
        # same family of generic type: compare the type parameters pairwise
        return all(are_equal(p1, p2) for p1, p2 in zip(a._wp_type_params_, b._wp_type_params_))

    if is_array(a) and type(a) is type(b):
        return True

    return are_equal(a, b)
def strides_from_shape(shape: Tuple, dtype):
    """Compute byte strides for a contiguous array of ``dtype`` elements.

    The last dimension has a stride of one element; each earlier dimension
    has the stride of the next dimension multiplied by that dimension's size.

    Args:
        shape: Size of each dimension.
        dtype: Element type; its size comes from ``type_size_in_bytes``.

    Returns:
        A tuple with one stride (in bytes) per dimension. An empty shape
        gives an empty tuple.
    """
    ndims = len(shape)
    if ndims == 0:
        # no dimensions means no strides; without this guard the code below
        # would index an empty list
        return ()

    strides = [None] * ndims

    # walk from the innermost dimension outwards
    i = ndims - 1
    strides[i] = type_size_in_bytes(dtype)

    while i > 0:
        strides[i - 1] = strides[i] * shape[i]
        i -= 1

    return tuple(strides)
class array(Array):
# member attributes available during code-gen (e.g.: d = array.shape[0])
# (initialized when needed)
_vars = None
def __init__(
    self,
    data=None,
    dtype: DType = Any,
    shape=None,
    strides=None,
    length=None,
    ptr=None,
    capacity=None,
    device=None,
    pinned=False,
    copy=True,
    owner=True,  # TODO: replace with deleter=None
    ndim=None,
    grad=None,
    requires_grad=False,
):
    """Constructs a new Warp array object

    There are four construction paths, tried in this order:

    1. ``data`` is given: the array is built from that list, tuple or
       ndarray. Objects that are not stored sequentially in memory (e.g. a
       list) are flattened first and then transferred to the memory space
       of ``device``.
    2. ``ptr`` is given: the array aliases existing memory that starts at
       that address, e.g. memory owned by an external or C library. The
       allocation should live on ``device``, and the caller should set the
       shape/length and ``dtype`` to match.
    3. ``shape`` (or the deprecated ``length``) is given: a new
       uninitialized array is created. Users are encouraged to call
       ``wp.empty()``, ``wp.zeros()`` or ``wp.full()`` instead.
    4. None of the above: a plain type annotation is constructed, as used
       for kernel arguments or struct members (e.g.
       ``arr: wp.array(dtype=float)``). Only ``dtype`` and ``ndim`` are
       taken into account and no memory is allocated.

    Args:
        data (Union[list, tuple, ndarray]): An object to construct the array from, can be a Tuple, List, or generally any type convertible to an np.array
        dtype (Union): One of the built-in types, e.g.: :class:`warp.mat33`, if dtype is Any and data an ndarray then it will be inferred from the array data type
        shape (tuple): Dimensions of the array
        strides (tuple): Number of bytes in each dimension between successive elements of the array
        length (int): Number of elements of the data type (deprecated, users should use `shape` argument)
        ptr (uint64): Address of an external memory address to alias (data should be None)
        capacity (int): Maximum size in bytes of the ptr allocation (data should be None)
        device (Devicelike): Device the array lives on
        copy (bool): Whether the incoming data will be copied or aliased, this is only possible when the incoming `data` already lives on the device specified and types match
        owner (bool): Should the array object try to deallocate memory when it is deleted
        requires_grad (bool): Whether or not gradients will be tracked for this array, see :class:`warp.Tape` for details
        grad (array): The gradient array to use
        pinned (bool): Whether to allocate pinned host memory, which allows asynchronous host-device transfers (only applicable with device="cpu")

    Raises:
        RuntimeError: If ``shape`` has more than ``ARRAY_MAX_DIMS``
            dimensions, or if both ``data`` and ``ptr`` are given.
    """
    # defaults, possibly overridden by the construction path chosen below
    self.owner = False
    self.ctype = None
    self._requires_grad = False
    self._grad = None

    # __array_interface__ or __cuda_array_interface__, evaluated lazily and cached
    self._array_interface = None

    self.is_transposed = False

    # canonicalize dtype
    if dtype == int:
        dtype = int32
    elif dtype == float:
        dtype = float32

    # normalize the shape to a tuple; it stays None when neither `shape`
    # nor `length` was specified
    if shape is None:
        if length is not None:
            # backward compatibility
            shape = (length,)
    else:
        shape = (shape,) if isinstance(shape, int) else tuple(shape)
        if len(shape) > ARRAY_MAX_DIMS:
            raise RuntimeError(
                f"Failed to create array with shape {shape}, the maximum number of dimensions is {ARRAY_MAX_DIMS}"
            )

    # determine the construction path from the given arguments
    if data is not None:
        # data or ptr, not both
        if ptr is not None:
            raise RuntimeError("Can only construct arrays with either `data` or `ptr` arguments, not both")
        self._init_from_data(data, dtype, shape, device, copy, pinned)
    elif ptr is not None:
        self._init_from_ptr(ptr, dtype, shape, strides, capacity, device, owner, pinned)
    elif shape is not None:
        self._init_new(dtype, shape, strides, device, pinned)
    else:
        self._init_annotation(dtype, ndim or 1)

    # gradients are only set up when the array has a device
    if self.device is None:
        return

    if grad is not None:
        # this will also check whether the gradient array is compatible
        self.grad = grad
        return

    # allocate gradient if needed
    self._requires_grad = requires_grad
    if requires_grad:
        with warp.ScopedStream(self.device.null_stream):
            self._alloc_grad()
def _init_from_data(self, data, dtype, shape, device, copy, pinned):
    """Initialize this array from host data (a sequence or a NumPy array).

    The data is converted to a NumPy array, optionally reshaped so that the
    trailing dimensions match the vector/matrix shape of ``dtype``, and then
    either aliased (CPU target, ``copy`` and ``pinned`` both false) or copied
    into a newly allocated array on ``device``.

    Args:
        data: Object with a ``__len__`` attribute (list, tuple, NumPy array, ...).
        dtype: Requested Warp type. ``Any`` infers the type from the data;
            a ``warp.codegen.Struct`` builds an array of structs.
        shape: Requested array shape (excluding the inner vector/matrix
            dimensions) or ``None`` to derive it from the data.
        device: Target device, resolved through ``warp.get_device()``.
        copy: Whether the data must be copied instead of aliased.
        pinned: Passed on to ``_init_new()`` when a copy is made.

    Raises:
        RuntimeError: If the data is a scalar, cannot be converted to the
            requested type, or cannot be reshaped to the requested shape.
    """
    if not hasattr(data, "__len__"):
        raise RuntimeError(f"Data must be a sequence or array, got scalar {data}")

    # split the requested type into its inner (vector/matrix) shape and scalar type
    if hasattr(dtype, "_wp_scalar_type_"):
        dtype_shape = dtype._shape_
        dtype_ndim = len(dtype_shape)
        scalar_dtype = dtype._wp_scalar_type_
    else:
        dtype_shape = ()
        dtype_ndim = 0
        scalar_dtype = dtype

    # convert input data to ndarray (handles lists, tuples, etc.) and determine dtype
    if dtype == Any:
        # infer dtype from data
        try:
            arr = np.array(data, copy=False, ndmin=1)
        except Exception as e:
            raise RuntimeError(f"Failed to convert input data to an array: {e}")
        dtype = np_dtype_to_warp_type.get(arr.dtype)
        if dtype is None:
            raise RuntimeError(f"Unsupported input data dtype: {arr.dtype}")
    elif isinstance(dtype, warp.codegen.Struct):
        if isinstance(data, np.ndarray):
            # construct from numpy structured array
            if data.dtype != dtype.numpy_dtype():
                raise RuntimeError(
                    f"Invalid source data type for array of structs, expected {dtype.numpy_dtype()}, got {data.dtype}"
                )
            arr = data
        elif isinstance(data, (list, tuple)):
            # construct from a sequence of structs
            try:
                # convert each struct instance to its corresponding ctype
                ctype_list = [v.__ctype__() for v in data]
                # convert the list of ctypes to a contiguous ctypes array
                ctype_arr = (dtype.ctype * len(ctype_list))(*ctype_list)
                # convert to numpy
                arr = np.frombuffer(ctype_arr, dtype=dtype.ctype)
            except Exception as e:
                raise RuntimeError(
                    f"Error while trying to construct Warp array from a sequence of Warp structs: {e}"
                )
        else:
            raise RuntimeError(
                "Invalid data argument for array of structs, expected a sequence of structs or a NumPy structured array"
            )
    else:
        # convert input data to the given dtype
        npdtype = warp_type_to_np_dtype.get(scalar_dtype)
        if npdtype is None:
            raise RuntimeError(
                f"Failed to convert input data to an array with Warp type {warp.context.type_str(dtype)}"
            )
        try:
            arr = np.array(data, dtype=npdtype, copy=False, ndmin=1)
        except Exception as e:
            raise RuntimeError(f"Failed to convert input data to an array with type {npdtype}: {e}")

    # determine whether the input needs reshaping
    target_npshape = None
    if shape is not None:
        # explicit shape: append the inner vector/matrix dimensions
        target_npshape = (*shape, *dtype_shape)
    elif dtype_ndim > 0:
        # prune inner dimensions of length 1
        while arr.ndim > 1 and arr.shape[-1] == 1:
            arr = np.squeeze(arr, axis=-1)
        # if the inner dims don't match exactly, check if the innermost dim is a multiple of type length
        if arr.ndim < dtype_ndim or arr.shape[-dtype_ndim:] != dtype_shape:
            if arr.shape[-1] == dtype._length_:
                # innermost dimension holds exactly one flattened vector/matrix
                target_npshape = (*arr.shape[:-1], *dtype_shape)
            elif arr.shape[-1] % dtype._length_ == 0:
                # innermost dimension holds several flattened vectors/matrices
                target_npshape = (*arr.shape[:-1], arr.shape[-1] // dtype._length_, *dtype_shape)
            else:
                if dtype_ndim == 1:
                    raise RuntimeError(
                        f"The inner dimensions of the input data are not compatible with the requested vector type {warp.context.type_str(dtype)}: expected an inner dimension that is a multiple of {dtype._length_}"
                    )
                else:
                    raise RuntimeError(
                        f"The inner dimensions of the input data are not compatible with the requested matrix type {warp.context.type_str(dtype)}: expected inner dimensions {dtype._shape_} or a multiple of {dtype._length_}"
                    )

    if target_npshape is not None:
        try:
            arr = arr.reshape(target_npshape)
        except Exception as e:
            raise RuntimeError(
                f"Failed to reshape the input data to the given shape {shape} and type {warp.context.type_str(dtype)}: {e}"
            )

    # determine final shape and strides
    if dtype_ndim > 0:
        # make sure the inner dims are contiguous for vector/matrix types
        scalar_size = type_size_in_bytes(dtype._wp_scalar_type_)
        inner_contiguous = arr.strides[-1] == scalar_size
        if inner_contiguous and dtype_ndim > 1:
            inner_contiguous = arr.strides[-2] == scalar_size * dtype_shape[-1]
        if not inner_contiguous:
            arr = np.ascontiguousarray(arr)
        # strip the inner dimensions; fall back to a single element if nothing is left
        shape = arr.shape[:-dtype_ndim] or (1,)
        strides = arr.strides[:-dtype_ndim] or (type_size_in_bytes(dtype),)
    else:
        shape = arr.shape or (1,)
        strides = arr.strides or (type_size_in_bytes(dtype),)

    device = warp.get_device(device)

    if device.is_cpu and not copy and not pinned:
        # reference numpy memory directly
        self._init_from_ptr(arr.ctypes.data, dtype, shape, strides, None, device, False, False)
        # keep a ref to the source array to keep allocation alive
        self._ref = arr
    else:
        # copy data into a new array
        self._init_new(dtype, shape, None, device, pinned)
        # temporary non-owning CPU array that wraps the NumPy buffer as the copy source
        src = array(
            ptr=arr.ctypes.data,
            dtype=dtype,
            shape=shape,
            strides=strides,
            device="cpu",
            copy=False,
            owner=False,
        )
        warp.copy(self, src)
def _init_from_ptr(self, ptr, dtype, shape, strides, capacity, device, owner, pinned):
    """Initialize this array as a wrapper around an existing memory address.

    Args:
        ptr: Address of the first element.
        dtype: Concrete element type (``Any`` is rejected).
        shape: Tuple of dimension sizes.
        strides: Byte strides per dimension, or ``None`` for the default
            strides computed by ``strides_from_shape()``.
        capacity: Allocation size in bytes, or ``None`` to derive it from
            the shape and strides.
        device: Device that the memory belongs to.
        owner: Stored as ``self.owner``.
        pinned: Stored as ``self.pinned`` for CPU devices, otherwise ``False``.

    Raises:
        RuntimeError: If ``dtype`` is ``Any``.
    """
    if dtype == Any:
        raise RuntimeError("A concrete data type is required to create the array")

    target_device = warp.get_device(device)

    # total number of elements
    num_elements = 1
    for extent in shape:
        num_elements *= extent

    packed_strides = strides_from_shape(shape, dtype)
    default_strides = strides is None
    if default_strides:
        strides = packed_strides

    if capacity is None:
        if default_strides:
            capacity = num_elements * type_size_in_bytes(dtype)
        else:
            # with custom strides the extent of the outermost dimension is used
            capacity = shape[0] * strides[0]

    self.ptr = ptr
    self.dtype = dtype
    self.device = target_device
    self.shape = shape
    self.strides = strides
    self.ndim = len(shape)
    self.size = num_elements
    self.capacity = capacity
    self.owner = owner
    self.pinned = pinned if target_device.is_cpu else False
    self.is_contiguous = default_strides or strides == packed_strides
def _init_new(self, dtype, shape, strides, device, pinned):
    """Initialize this array with memory newly obtained from the device allocator.

    Args:
        dtype: Concrete element type (``Any`` is rejected).
        shape: Tuple of dimension sizes.
        strides: Byte strides per dimension, or ``None`` for the default
            strides computed by ``strides_from_shape()``.
        device: Device to allocate on.
        pinned: Forwarded to the allocator; stored as ``self.pinned`` only
            for CPU devices.

    Raises:
        RuntimeError: If ``dtype`` is ``Any`` or the allocator returns ``None``.
    """
    if dtype == Any:
        raise RuntimeError("A concrete data type is required to create the array")

    device = warp.get_device(device)

    # total number of elements
    num_elements = 1
    for extent in shape:
        num_elements *= extent

    packed_strides = strides_from_shape(shape, dtype)
    if strides is not None:
        is_contiguous = strides == packed_strides
        capacity = shape[0] * strides[0]
    else:
        strides = packed_strides
        is_contiguous = True
        capacity = num_elements * type_size_in_bytes(dtype)

    # zero-sized arrays do not allocate anything
    ptr = None
    if capacity > 0:
        ptr = device.allocator.alloc(capacity, pinned=pinned)
        if ptr is None:
            raise RuntimeError(f"Array allocation failed on device: {device} for {capacity} bytes")

    self.ptr = ptr
    self.dtype = dtype
    self.device = device
    self.shape = shape
    self.strides = strides
    self.ndim = len(shape)
    self.size = num_elements
    self.capacity = capacity
    self.owner = True
    self.pinned = pinned if device.is_cpu else False
    self.is_contiguous = is_contiguous
def _init_annotation(self, dtype, ndim):
    """Initialize a data-less array that only records ``dtype`` and ``ndim``.

    No memory or device is attached: the pointer and device are ``None`` and
    every extent and stride is zero.
    """
    self.dtype = dtype
    self.ndim = ndim

    # no elements and no storage
    self.size = 0
    self.capacity = 0
    zero_extents = (0,) * ndim
    self.shape = zero_extents
    self.strides = zero_extents

    # not bound to any memory or device
    self.ptr = None
    self.device = None
    self.owner = False
    self.pinned = False
    self.is_contiguous = False
@property
def __array_interface__(self):
    """Array interface dictionary (version 3) for CPU arrays, built once and cached.

    Raises:
        AttributeError: If the array has no device or is not on a CPU device.
    """
    # raising an AttributeError here makes hasattr() return False
    if self.device is None or not self.device.is_cpu:
        raise AttributeError(f"__array_interface__ not supported because device is {self.device}")

    if self._array_interface is not None:
        return self._array_interface

    # start from the scalar layout and refine it for structs and vector/matrix types
    element_type = self.dtype
    flat_shape = self.shape
    flat_strides = self.strides
    layout = None

    if isinstance(element_type, warp.codegen.Struct):
        # struct: describe the fields through the structured NumPy dtype
        layout = element_type.numpy_dtype()
    elif issubclass(element_type, ctypes.Array):
        # vector type, flatten the dimensions into one tuple
        inner_shape = element_type._shape_
        flat_shape = (*self.shape, *inner_shape)
        inner_strides = strides_from_shape(inner_shape, element_type._type_)
        flat_strides = (*self.strides, *inner_strides)

    address = 0 if self.ptr is None else self.ptr
    self._array_interface = {
        "data": (address, False),
        "shape": tuple(flat_shape),
        "strides": tuple(flat_strides),
        "typestr": type_typestr(self.dtype),
        "descr": layout,  # optional description of structured array layout
        "version": 3,
    }
    return self._array_interface
@property
def __cuda_array_interface__(self):
    """CUDA array interface dictionary (version 2) for CUDA arrays, built once and cached.

    Raises:
        AttributeError: If the array has no device or is not on a CUDA device.
    """
    # raising an AttributeError here makes hasattr() return False
    if self.device is None or not self.device.is_cuda:
        raise AttributeError(f"__cuda_array_interface__ is not supported because device is {self.device}")

    if self._array_interface is None:
        # Struct dtypes are instances rather than classes, so they have to be
        # ruled out before calling issubclass(), which raises TypeError otherwise
        is_struct = isinstance(self.dtype, warp.codegen.Struct)

        # get flat shape (including type shape)
        if not is_struct and issubclass(self.dtype, ctypes.Array):
            # vector type, flatten the dimensions into one tuple
            arr_shape = (*self.shape, *self.dtype._shape_)
            dtype_strides = strides_from_shape(self.dtype._shape_, self.dtype._type_)
            arr_strides = (*self.strides, *dtype_strides)
        else:
            # scalar or struct type
            arr_shape = self.shape
            arr_strides = self.strides

        self._array_interface = {
            "data": (self.ptr if self.ptr is not None else 0, False),
            "shape": tuple(arr_shape),
            "strides": tuple(arr_strides),
            "typestr": type_typestr(self.dtype),
            "version": 2,
        }

    return self._array_interface
def __del__(self):
    """Return the memory to the device allocator if this array owns it."""
    if not self.owner:
        return

    device = self.device
    # use CUDA context guard to avoid side effects during garbage collection
    with device.context_guard:
        device.allocator.free(self.ptr, self.capacity, self.pinned)
def __len__(self):
    """Return the size of the first dimension."""
    leading_extent = self.shape[0]
    return leading_extent
def __str__(self):
    """Return the contents as text, or only the type information for device-less arrays."""
    if self.device is not None:
        return str(self.numpy())

    # for 'empty' arrays we just return the type information, these are used in kernel function signatures
    return f"array{self.dtype}"
def __getitem__(self, key):
    """Return a non-owning view of a sub-region of this array.

    Args:
        key: An int, a slice, a Warp index array, or a tuple of those.
            Dimensions that are not addressed are taken in full.

    Returns:
        An ``array`` aliasing this array's memory (and gradient, if any), or
        an ``indexedarray`` wrapping that view when index arrays are given.

    Raises:
        RuntimeError: If the key addresses a single item, has an unsupported
            type, or contains a slice that is out of range, empty, or has a
            non-positive step.
    """
    if isinstance(key, int):
        if self.ndim == 1:
            raise RuntimeError("Item indexing is not supported on wp.array objects")
        key = [key]
    elif isinstance(key, (slice, array)):
        key = [key]
    elif isinstance(key, tuple):
        contains_slice = any(isinstance(k, slice) for k in key)
        contains_indices = any(isinstance(k, array) for k in key)
        if not contains_slice and not contains_indices and len(key) == self.ndim:
            raise RuntimeError("Item indexing is not supported on wp.array objects")
    else:
        raise RuntimeError(f"Invalid index: {key}")

    # pad the key with full slices for the dimensions that were not addressed
    key = (*key, *([slice(None, None, None)] * (self.ndim - len(key))))

    new_shape = []
    new_strides = []
    ptr_offset = 0

    # maps dimension index to an array of indices, if given
    index_arrays = {}

    for idx, k in enumerate(key):
        if isinstance(k, slice):
            start, stop, step = k.start, k.stop, k.step
            if start is None:
                start = 0
            if stop is None:
                stop = self.shape[idx]
            if step is None:
                step = 1
            if start < 0:
                start = self.shape[idx] + start
            if stop < 0:
                stop = self.shape[idx] + stop

            if start < 0 or start >= self.shape[idx]:
                raise RuntimeError(f"Invalid indexing in slice: {start}:{stop}:{step}")
            if stop < 1 or stop > self.shape[idx]:
                raise RuntimeError(f"Invalid indexing in slice: {start}:{stop}:{step}")
            if stop <= start:
                raise RuntimeError(f"Invalid indexing in slice: {start}:{stop}:{step}")
            # a zero step would divide by zero below and a negative step would
            # produce a negative extent, so only forward steps are accepted
            if step <= 0:
                raise RuntimeError(f"Invalid indexing in slice: {start}:{stop}:{step}")

            new_shape.append(-((stop - start) // -step))  # ceil division
            new_strides.append(self.strides[idx] * step)

            ptr_offset += self.strides[idx] * start

        elif isinstance(k, array):
            # note: index array properties will be checked during indexedarray construction
            index_arrays[idx] = k

            # shape and strides are unchanged for this dimension
            new_shape.append(self.shape[idx])
            new_strides.append(self.strides[idx])

        else:  # is int
            start = k
            if start < 0:
                start = self.shape[idx] + start
            if start < 0 or start >= self.shape[idx]:
                raise RuntimeError(f"Invalid indexing in slice: {k}")

            # the dimension is dropped from the view; only the offset changes
            ptr_offset += self.strides[idx] * start

    # handle grad
    if self.grad is not None:
        new_grad = array(
            ptr=self.grad.ptr + ptr_offset if self.grad.ptr is not None else None,
            dtype=self.grad.dtype,
            shape=tuple(new_shape),
            strides=tuple(new_strides),
            device=self.grad.device,
            pinned=self.grad.pinned,
            owner=False,
        )
        # store back-ref to stop data being destroyed
        new_grad._ref = self.grad
    else:
        new_grad = None

    a = array(
        ptr=self.ptr + ptr_offset if self.ptr is not None else None,
        dtype=self.dtype,
        shape=tuple(new_shape),
        strides=tuple(new_strides),
        device=self.device,
        pinned=self.pinned,
        owner=False,
        grad=new_grad,
    )

    # store back-ref to stop data being destroyed
    a._ref = self

    if index_arrays:
        indices = [None] * self.ndim
        for dim, index_array in index_arrays.items():
            indices[dim] = index_array
        return indexedarray(a, indices)
    else:
        return a
# construct a C-representation of the array for passing to kernels
def __ctype__(self):
    """Return the cached ``array_t`` descriptor, building it on first use."""
    if self.ctype is not None:
        return self.ctype

    # missing pointers are passed as 0
    if self.ptr is None:
        data = 0
    else:
        data = ctypes.c_uint64(self.ptr)

    if self.grad is None or self.grad.ptr is None:
        grad = 0
    else:
        grad = ctypes.c_uint64(self.grad.ptr)

    self.ctype = array_t(data=data, grad=grad, ndim=self.ndim, shape=self.shape, strides=self.strides)
    return self.ctype
def __matmul__(self, other):
    """
    Enables A @ B syntax for matrix multiplication

    Both operands must be two-dimensional. Two zero-initialized arrays of
    shape ``(A.shape[0], B.shape[1])`` are passed to ``matmul()`` and the
    second one is returned.
    """
    both_2d = self.ndim == 2 and other.ndim == 2
    if not both_2d:
        raise RuntimeError(
            "A has dim = {}, B has dim = {}. If multiplying with @, A and B must have dim = 2.".format(
                self.ndim, other.ndim
            )
        )

    out_shape = (self.shape[0], other.shape[1])
    c = warp.zeros(shape=out_shape, dtype=self.dtype, device=self.device, requires_grad=True)
    d = warp.zeros(shape=out_shape, dtype=self.dtype, device=self.device, requires_grad=True)
    matmul(self, other, c, d, device=self.device)
    return d
@property
def grad(self):
    """The gradient array attached to this array, or ``None``."""
    return self._grad

@grad.setter
def grad(self, grad):
    """Attach or detach a gradient array.

    Raises:
        ValueError: If the dtype, shape, strides or device of ``grad`` differ
            from those of this array.
    """
    attach = grad is not None
    if attach:
        # make sure the given gradient array is compatible
        incompatible = (
            grad.dtype != self.dtype
            or grad.shape != self.shape
            or grad.strides != self.strides
            or grad.device != self.device
        )
        if incompatible:
            raise ValueError("The given gradient array is incompatible")

    # `grad` is None when detaching
    self._grad = grad
    self._requires_grad = attach

    # trigger re-creation of C-representation
    self.ctype = None
@property
def requires_grad(self):
    """Whether a gradient is tracked for this array."""
    return self._requires_grad

@requires_grad.setter
def requires_grad(self, value: builtins.bool):
    """Enable or disable gradient tracking.

    Enabling allocates a gradient array if none is attached; disabling
    drops the reference to the current gradient array.
    """
    if not value:
        self._grad = None
    elif self._grad is None:
        self._alloc_grad()

    self._requires_grad = value

    # trigger re-creation of C-representation
    self.ctype = None
def _alloc_grad(self):
    """Allocate a zeroed gradient array with the same layout as this array."""
    layout = {
        "dtype": self.dtype,
        "shape": self.shape,
        "strides": self.strides,
        "device": self.device,
        "pinned": self.pinned,
    }
    self._grad = array(**layout)
    self._grad.zero_()

    # trigger re-creation of C-representation
    self.ctype = None
@property
def vars(self):
    """Member attributes available during code-gen (e.g.: d = array.shape[0])."""
    # Note: we use a shared dict for all array instances
    shared = array._vars
    if shared is None:
        shared = {"shape": warp.codegen.Var("shape", shape_t)}
        array._vars = shared
    return shared
def zero_(self):
    """Zeroes-out the array entries."""
    if not self.is_contiguous:
        self.fill_(0)
        return

    # simple memset is usually faster than generic fill
    num_bytes = self.size * type_size_in_bytes(self.dtype)
    self.device.memset(self.ptr, 0, num_bytes)
def fill_(self, value):
    """Set all array entries to `value`

    Args:
        value: The value to set every array entry to. Must be convertible to the array's ``dtype``.

    Raises:
        ValueError: If `value` cannot be converted to the array's ``dtype``.

    Examples:
        ``fill_()`` can take lists or other sequences when filling arrays of vectors or matrices.

        >>> arr = wp.zeros(2, dtype=wp.mat22)
        >>> arr.numpy()
        array([[[0., 0.],
                [0., 0.]],
        <BLANKLINE>
               [[0., 0.],
                [0., 0.]]], dtype=float32)
        >>> arr.fill_([[1, 2], [3, 4]])
        >>> arr.numpy()
        array([[[1., 2.],
                [3., 4.]],
        <BLANKLINE>
               [[1., 2.],
                [3., 4.]]], dtype=float32)
    """
    # nothing to write for empty arrays
    if self.size == 0:
        return

    # try to convert the given value to the array dtype
    try:
        if isinstance(self.dtype, warp.codegen.Struct):
            if isinstance(value, self.dtype.cls):
                cvalue = value.__ctype__()
            elif value == 0:
                # allow zero-initializing structs using default constructor
                cvalue = self.dtype().__ctype__()
            else:
                raise ValueError(
                    f"Invalid initializer value for struct {self.dtype.cls.__name__}, expected struct instance or 0"
                )
        elif issubclass(self.dtype, ctypes.Array):
            # vector/matrix
            cvalue = self.dtype(value)
        else:
            # scalar: unwrap Warp scalar instances to their Python value first
            if type(value) in warp.types.scalar_types:
                value = value.value
            # float16 is handed to the ctypes constructor as half-precision bits
            raw = float_to_half_bits(value) if self.dtype == float16 else value
            cvalue = self.dtype._type_(raw)
    except Exception as e:
        raise ValueError(f"Failed to convert the value to the array data type: {e}")

    cvalue_ptr = ctypes.pointer(cvalue)
    cvalue_size = ctypes.sizeof(cvalue)

    # prefer using memtile for contiguous arrays, because it should be faster than generic fill
    if self.is_contiguous:
        self.device.memtile(self.ptr, cvalue_ptr, cvalue_size, self.size)
        return

    # generic fill through the array descriptor
    descriptor = self.__ctype__()
    descriptor_ptr = ctypes.pointer(descriptor)
    core = warp.context.runtime.core
    if self.device.is_cuda:
        core.array_fill_device(self.device.context, descriptor_ptr, ARRAY_TYPE_REGULAR, cvalue_ptr, cvalue_size)
    else:
        core.array_fill_host(descriptor_ptr, ARRAY_TYPE_REGULAR, cvalue_ptr, cvalue_size)
def assign(self, src):
    """Wraps ``src`` in an :class:`warp.array` if it is not already one and copies the contents to ``self``."""
    if not is_array(src):
        # wrap the raw data in a CPU array of matching dtype before copying
        src = array(data=src, dtype=self.dtype, copy=False, device="cpu")
    warp.copy(self, src)
def numpy(self):
    """Converts the array to a :class:`numpy.ndarray` (aliasing memory through the array interface protocol)

    If the array is on the GPU, a synchronous device-to-host copy (on the CUDA default stream) will be
    automatically performed to ensure that any outstanding work is completed.
    """
    if not self.ptr:
        # return an empty numpy array with the correct dtype and shape
        element_type = self.dtype
        if isinstance(element_type, warp.codegen.Struct):
            np_type = element_type.numpy_dtype()
            np_shape = self.shape
        elif issubclass(element_type, ctypes.Array):
            np_type = warp_type_to_np_dtype[element_type._wp_scalar_type_]
            np_shape = (*self.shape, *element_type._shape_)
        else:
            np_type = warp_type_to_np_dtype[element_type]
            np_shape = self.shape
        return np.empty(np_shape, dtype=np_type)

    # use the CUDA default stream for synchronous behaviour with other streams
    with warp.ScopedStream(self.device.null_stream):
        host_array = self.to("cpu", requires_grad=False)

    # convert through __array_interface__
    # Note: this handles arrays of structs using `descr`, so the result will be a structured NumPy array
    return np.array(host_array, copy=False)
def cptr(self):
    """Return a ctypes cast of the array address.

    Notes:

    #. Only CPU arrays support this method.
    #. The array must be contiguous.
    #. Accesses to this object are **not** bounds checked.
    #. For ``float16`` types, a pointer to the internal ``uint16`` representation is returned.
    """
    if not self.ptr:
        return None

    if self.device != "cpu" or not self.is_contiguous:
        raise RuntimeError(
            "Accessing array memory through a ctypes ptr is only supported for contiguous CPU arrays."
        )

    # structs expose their ctypes structure, every other dtype its `_type_`
    if isinstance(self.dtype, warp.codegen.Struct):
        pointee = self.dtype.ctype
    else:
        pointee = self.dtype._type_
    typed_ptr = ctypes.cast(self.ptr, ctypes.POINTER(pointee))

    # store backref to the underlying array to avoid it being deallocated
    typed_ptr._ref = self
    return typed_ptr
def list(self):
    """Returns a flattened list of items in the array as a Python list."""
    host = self.numpy()

    is_struct = isinstance(self.dtype, warp.codegen.Struct)
    if not is_struct and not issubclass(self.dtype, ctypes.Array):
        # scalar
        return list(host.flatten())

    if is_struct:
        # struct
        items = host.flatten()
    else:
        # vector/matrix - flatten, but preserve inner vector/matrix dimensions
        items = host.reshape((self.size, *self.dtype._shape_))

    # build one dtype instance per item from its address in the NumPy buffer
    base = items.ctypes.data
    step = items.strides[0]
    return [self.dtype.from_ptr(base + i * step) for i in range(self.size)]
def to(self, device, requires_grad=None):
    """Returns a Warp array with this array's data moved to the specified device, no-op if already on device."""
    target = warp.get_device(device)
    if self.device != target:
        return warp.clone(self, device=target, requires_grad=requires_grad)
    return self
def flatten(self):
    """Returns a zero-copy view of the array collapsed to 1-D. Only supported for contiguous arrays."""
    # already one-dimensional
    if self.ndim == 1:
        return self

    if not self.is_contiguous:
        raise RuntimeError("Flattening non-contiguous arrays is unsupported.")

    # the gradient, if any, is flattened the same way
    flat_grad = self.grad.flatten() if self.grad is not None else None

    flat = array(
        ptr=self.ptr,
        dtype=self.dtype,
        shape=(self.size,),
        device=self.device,
        pinned=self.pinned,
        copy=False,
        owner=False,
        grad=flat_grad,
    )

    # store back-ref to stop data being destroyed
    flat._ref = self
    return flat
def reshape(self, shape):
    """Returns a reshaped array. Only supported for contiguous arrays.

    Args:
        shape : An int or tuple of ints specifying the shape of the returned array.
            At most one dimension may be ``-1``, in which case its size is inferred
            from the array size and the remaining dimensions.

    Returns:
        A non-owning array that aliases this array's memory (and gradient, if any).

    Raises:
        RuntimeError: If the array is not contiguous, ``shape`` is ``None``, has too
            many dimensions, cannot be inferred, or does not match the array size.
    """
    if not self.is_contiguous:
        raise RuntimeError("Reshaping non-contiguous arrays is unsupported.")

    # convert shape to tuple
    if shape is None:
        raise RuntimeError("shape parameter is required.")
    if isinstance(shape, int):
        shape = (shape,)
    elif not isinstance(shape, tuple):
        shape = tuple(shape)

    if len(shape) > ARRAY_MAX_DIMS:
        raise RuntimeError(
            f"Arrays may only have {ARRAY_MAX_DIMS} dimensions maximum, trying to create array with {len(shape)} dims."
        )

    # check for -1 dimension and reformat
    if -1 in shape:
        if shape.count(-1) > 1:
            raise RuntimeError("Cannot infer shape if more than one index is -1.")

        idx = shape.index(-1)
        denom = 1
        for d in shape:
            if d != -1:
                denom *= d

        # a zero-sized sibling dimension leaves the missing extent undetermined
        if denom == 0:
            raise RuntimeError("Cannot infer shape if another dimension is 0.")

        new_shape = list(shape)
        # integer division keeps the result exact for arbitrarily large sizes
        new_shape[idx] = self.size // denom
        shape = tuple(new_shape)

    size = 1
    for d in shape:
        size *= d

    if size != self.size:
        raise RuntimeError("Reshaped array must have the same total size as the original.")

    a = array(
        ptr=self.ptr,
        dtype=self.dtype,
        shape=shape,
        strides=None,
        device=self.device,
        pinned=self.pinned,
        copy=False,
        owner=False,
        grad=None if self.grad is None else self.grad.reshape(shape),
    )

    # store back-ref to stop data being destroyed
    a._ref = self
    return a
def view(self, dtype):
    """Returns a zero-copy view of this array's memory with a different data type.

    ``dtype`` must have the same byte size of the array's native ``dtype``.
    """
    same_size = type_size_in_bytes(dtype) == type_size_in_bytes(self.dtype)
    if not same_size:
        raise RuntimeError("Cannot cast dtypes of unequal byte size")

    # the gradient, if any, is reinterpreted the same way
    grad_view = self.grad.view(dtype) if self.grad is not None else None

    # return an alias of the array memory with different type information
    alias = array(
        ptr=self.ptr,
        dtype=dtype,
        shape=self.shape,
        strides=self.strides,
        device=self.device,
        pinned=self.pinned,
        copy=False,
        owner=False,
        grad=grad_view,
    )

    # keep the original array alive for as long as the alias exists
    alias._ref = self
    return alias
def contiguous(self):
    """Returns a contiguous array with this array's data. No-op if array is already contiguous."""
    if not self.is_contiguous:
        packed = warp.empty_like(self)
        warp.copy(packed, self)
        return packed
    return self
def transpose(self, axes=None):
    """Returns an zero-copy view of the array with axes transposed.

    Note: The transpose operation will return an array with a non-contiguous access pattern.

    Args:
        axes (optional): Specifies the how the axes are permuted. If not specified, the axes order will be reversed.
    """
    # noop if 1d array
    if self.ndim == 1:
        return self

    if axes is None:
        # reverse the order of the axes
        axes = range(self.ndim - 1, -1, -1)
    elif len(axes) != len(self.shape):
        raise RuntimeError("Length of parameter axes must be equal in length to array shape")

    num_axes = len(self.shape)
    permuted_shape = []
    permuted_strides = []
    for axis in axes:
        if not isinstance(axis, int):
            raise RuntimeError(f"axis index {axis} is not of type int")
        if axis >= num_axes:
            raise RuntimeError(f"axis index {axis} must be smaller than the number of axes in array")
        permuted_shape.append(self.shape[axis])
        permuted_strides.append(self.strides[axis])

    # the gradient, if any, is permuted the same way
    permuted_grad = self.grad.transpose(axes=axes) if self.grad is not None else None

    result = array(
        ptr=self.ptr,
        dtype=self.dtype,
        shape=tuple(permuted_shape),
        strides=tuple(permuted_strides),
        device=self.device,
        pinned=self.pinned,
        copy=False,
        owner=False,
        grad=permuted_grad,
    )

    result.is_transposed = not self.is_transposed

    # keep the original array alive for as long as the view exists
    result._ref = self
    return result
# aliases for arrays with small dimensions
def array1d(*args, **kwargs):
    """Equivalent to calling ``array(..., ndim=1)``; any ``ndim`` passed by the caller is overridden."""
    kwargs.update(ndim=1)
    return array(*args, **kwargs)
# equivalent to calling array(..., ndim=2)
def array2d(*args, **kwargs):
    """Equivalent to calling ``array(..., ndim=2)``; any ``ndim`` passed by the caller is overridden."""
    kwargs.update(ndim=2)
    return array(*args, **kwargs)
# equivalent to calling array(..., ndim=3)
def array3d(*args, **kwargs):
    """Equivalent to calling ``array(..., ndim=3)``; any ``ndim`` passed by the caller is overridden."""
    kwargs.update(ndim=3)
    return array(*args, **kwargs)
# equivalent to calling array(..., ndim=4)
def array4d(*args, **kwargs):
    """Equivalent to calling ``array(..., ndim=4)``; any ``ndim`` passed by the caller is overridden."""
    kwargs.update(ndim=4)
    return array(*args, **kwargs)
# TODO: Rewrite so that we take only shape, not length and optional shape
def from_ptr(ptr, length, dtype=None, shape=None, device=None):
    """Create a non-owning array without gradient from a pointer.

    Unless ``ptr`` is 0, it is cast to a pointer to ``size_t`` and the value
    stored there is used as the array address.

    Args:
        ptr: 0, or an object accepted by ``ctypes.cast()``.
        length: Number of elements; also used to compute the capacity in bytes.
        dtype: Element type of the array.
        shape: Optional shape forwarded to the array constructor.
        device: Device forwarded to the array constructor.
    """
    byte_capacity = length * type_size_in_bytes(dtype)

    if ptr == 0:
        address = 0
    else:
        address = ctypes.cast(ptr, ctypes.POINTER(ctypes.c_size_t)).contents.value

    return array(
        dtype=dtype,
        length=length,
        capacity=byte_capacity,
        ptr=address,
        shape=shape,
        device=device,
        owner=False,
        requires_grad=False,
    )
# A base class for non-contiguous arrays, providing the implementation of common methods like
# contiguous(), to(), numpy(), list(), assign(), zero_(), and fill_().
class noncontiguous_array_base(Generic[T]):
    """Shared implementation of common methods for non-contiguous array types."""

    def __init__(self, array_type_id):
        # type id handed to the native fill routines
        self.type_id = array_type_id
        self.is_contiguous = False

    def contiguous(self):
        """Return a contiguous copy of this array."""
        packed = warp.empty_like(self)
        warp.copy(packed, self)
        return packed

    def to(self, device):
        """Copy the data to ``device``; returns ``self`` if it already lives there."""
        target = warp.get_device(device)
        if self.device != target:
            return warp.clone(self, device=target)
        return self

    def numpy(self):
        """Return the contents as a NumPy array, going through a contiguous copy."""
        # use the CUDA default stream for synchronous behaviour with other streams
        with warp.ScopedStream(self.device.null_stream):
            packed = self.contiguous()
            return packed.numpy()

    def list(self):
        """Returns a flattened list of items in the array as a Python list."""
        # use the CUDA default stream for synchronous behaviour with other streams
        with warp.ScopedStream(self.device.null_stream):
            packed = self.contiguous()
            return packed.list()

    def assign(self, src):
        """Copy ``src`` into this array, wrapping it in a CPU array first if it is not an array."""
        if not is_array(src):
            src = array(data=src, dtype=self.dtype, copy=False, device="cpu")
        warp.copy(self, src)

    def zero_(self):
        """Set every entry to zero."""
        self.fill_(0)

    def fill_(self, value):
        """Set every entry to ``value``.

        Raises:
            ValueError: If ``value`` cannot be converted to the array's ``dtype``.
        """
        # nothing to write for empty arrays
        if self.size == 0:
            return

        # try to convert the given value to the array dtype
        try:
            if isinstance(self.dtype, warp.codegen.Struct):
                if isinstance(value, self.dtype.cls):
                    cvalue = value.__ctype__()
                elif value == 0:
                    # allow zero-initializing structs using default constructor
                    cvalue = self.dtype().__ctype__()
                else:
                    raise ValueError(
                        f"Invalid initializer value for struct {self.dtype.cls.__name__}, expected struct instance or 0"
                    )
            elif issubclass(self.dtype, ctypes.Array):
                # vector/matrix
                cvalue = self.dtype(value)
            else:
                # scalar: unwrap Warp scalar instances to their Python value first
                if type(value) in warp.types.scalar_types:
                    value = value.value
                # float16 is handed to the ctypes constructor as half-precision bits
                raw = float_to_half_bits(value) if self.dtype == float16 else value
                cvalue = self.dtype._type_(raw)
        except Exception as e:
            raise ValueError(f"Failed to convert the value to the array data type: {e}")

        cvalue_ptr = ctypes.pointer(cvalue)
        cvalue_size = ctypes.sizeof(cvalue)

        descriptor = self.__ctype__()
        descriptor_ptr = ctypes.pointer(descriptor)

        core = warp.context.runtime.core
        if self.device.is_cuda:
            core.array_fill_device(self.device.context, descriptor_ptr, self.type_id, cvalue_ptr, cvalue_size)
        else:
            core.array_fill_host(descriptor_ptr, self.type_id, cvalue_ptr, cvalue_size)
# helper to check index array properties
def check_index_array(indices, expected_device):
    """Validate an index array.

    Args:
        indices: Candidate index array.
        expected_device: Device that the index array has to live on.

    Raises:
        ValueError: If ``indices`` is not a one-dimensional ``int32`` Warp
            array on ``expected_device``.
    """
    if not isinstance(indices, array):
        raise ValueError(f"Indices must be a Warp array, got {type(indices)}")
    if indices.ndim != 1:
        raise ValueError(f"Index array must be one-dimensional, got {indices.ndim}")
    if indices.dtype != int32:
        raise ValueError(f"Index array must use int32, got dtype {indices.dtype}")
    if indices.device != expected_device:
        # each device name gets its own balanced pair of parentheses
        raise ValueError(f"Index array device ({indices.device}) does not match data array device ({expected_device})")
class indexedarray(noncontiguous_array_base[T]):
    """View of a Warp array whose dimensions may be addressed through index arrays."""

    # member attributes available during code-gen (e.g.: d = arr.shape[0])
    # (initialized when needed)
    _vars = None

    def __init__(self, data: array = None, indices: Union[array, List[array]] = None, dtype=None, ndim=None):
        """Create an indexed view of ``data``, or an empty placeholder when ``data`` is ``None``.

        Args:
            data: The Warp array to index.
            indices: A single index array (applied to the first dimension) or a
                list/tuple with one index array or ``None`` per dimension.
            dtype: Expected element type; must match ``data.dtype`` if both are given.
            ndim: Expected dimensionality; must match ``data.ndim`` if both are given.

        Raises:
            ValueError: If the arguments are inconsistent or an index array is invalid.
        """
        super().__init__(ARRAY_TYPE_INDEXED)

        # canonicalize types
        if dtype is not None:
            if dtype == int:
                dtype = int32
            elif dtype == float:
                dtype = float32

        self.data = data
        self.indices = [None] * ARRAY_MAX_DIMS

        if data is None:
            # allow empty indexedarrays in type annotations
            self.dtype = dtype
            self.ndim = ndim or 1
            self.device = None
            self.pinned = False
            self.shape = (0,) * self.ndim
        else:
            if not isinstance(data, array):
                raise ValueError("Indexed array data must be a Warp array")
            if dtype is not None and dtype != data.dtype:
                raise ValueError(f"Requested dtype ({dtype}) does not match dtype of data array ({data.dtype})")
            if ndim is not None and ndim != data.ndim:
                raise ValueError(
                    f"Requested dimensionality ({ndim}) does not match dimensionality of data array ({data.ndim})"
                )

            self.dtype = data.dtype
            self.ndim = data.ndim
            self.device = data.device
            self.pinned = data.pinned

            # determine shape from original data shape and index counts
            extents = list(data.shape)

            if indices is not None:
                if isinstance(indices, (list, tuple)):
                    if len(indices) > self.ndim:
                        raise ValueError(
                            f"Number of indices provided ({len(indices)}) exceeds number of dimensions ({self.ndim})"
                        )
                    for dim, index_array in enumerate(indices):
                        if index_array is None:
                            continue
                        check_index_array(index_array, data.device)
                        self.indices[dim] = index_array
                        extents[dim] = len(index_array)
                elif isinstance(indices, array):
                    # only a single index array was provided
                    check_index_array(indices, data.device)
                    self.indices[0] = indices
                    extents[0] = len(indices)
                else:
                    raise ValueError("Indices must be a single Warp array or a list of Warp arrays")

            self.shape = tuple(extents)

        # update size (num elements)
        self.size = 1
        for extent in self.shape:
            self.size *= extent

    def __len__(self):
        """Return the size of the first dimension."""
        return self.shape[0]

    def __str__(self):
        """Return the contents as text, or only the type information for device-less arrays."""
        if self.device is not None:
            return str(self.numpy())
        # type annotation
        return f"indexedarray{self.dtype}"

    # construct a C-representation of the array for passing to kernels
    def __ctype__(self):
        """Build an ``indexedarray_t`` from the data array, index arrays and shape."""
        return indexedarray_t(self.data, self.indices, self.shape)

    @property
    def vars(self):
        """Member attributes available during code-gen (e.g.: d = arr.shape[0])."""
        # Note: we use a shared dict for all indexedarray instances
        shared = indexedarray._vars
        if shared is None:
            shared = {"shape": warp.codegen.Var("shape", shape_t)}
            indexedarray._vars = shared
        return shared
# aliases for indexedarrays with small dimensions
def indexedarray1d(*args, **kwargs):
    """Equivalent to calling ``indexedarray(..., ndim=1)``; any ``ndim`` passed by the caller is overridden."""
    kwargs.update(ndim=1)
    return indexedarray(*args, **kwargs)
# equivalent to calling indexedarray(..., ndim=2)
def indexedarray2d(*args, **kwargs):
    """Equivalent to calling ``indexedarray(..., ndim=2)``; any ``ndim`` passed by the caller is overridden."""
    kwargs.update(ndim=2)
    return indexedarray(*args, **kwargs)
# equivalent to calling indexedarray(..., ndim=3)
def indexedarray3d(*args, **kwargs):
    """Equivalent to calling ``indexedarray(..., ndim=3)``; any ``ndim`` passed by the caller is overridden."""
    kwargs.update(ndim=3)
    return indexedarray(*args, **kwargs)
# equivalent to calling indexedarray(..., ndim=4)
def indexedarray4d(*args, **kwargs):
    """Equivalent to calling ``indexedarray(..., ndim=4)``; any ``ndim`` passed by the caller is overridden."""
    kwargs.update(ndim=4)
    return indexedarray(*args, **kwargs)
# NOTE(review): this import sits below the array definitions instead of at the top of the
# file (hence the E402 suppression), presumably to avoid a circular import with
# warp.fabric; confirm before moving it.
from warp.fabric import fabricarray, indexedfabricarray  # noqa: E402

# the array classes known to this module, in the order checked by array_type_id()
array_types = (array, indexedarray, fabricarray, indexedfabricarray)
def array_type_id(a):
    """Return the ``ARRAY_TYPE_*`` constant that corresponds to the class of ``a``.

    Raises:
        ValueError: If ``a`` is not an instance of a known array class.
    """
    # checked in order, the first matching class wins
    known_types = (
        (array, ARRAY_TYPE_REGULAR),
        (indexedarray, ARRAY_TYPE_INDEXED),
        (fabricarray, ARRAY_TYPE_FABRIC),
        (indexedfabricarray, ARRAY_TYPE_FABRIC_INDEXED),
    )
    for array_cls, type_id in known_types:
        if isinstance(a, array_cls):
            return type_id
    raise ValueError("Invalid array type")
class Bvh:
    def __init__(self, lowers, uppers):
        """Class representing a bounding volume hierarchy.

        Attributes:
            id: Unique identifier for this bvh object, can be passed to kernels.
            device: Device this object lives on, all buffers must live on the same device.

        Args:
            lowers (:class:`warp.array`): Array of lower bounds :class:`warp.vec3`
            uppers (:class:`warp.array`): Array of upper bounds :class:`warp.vec3`
        """
        if len(lowers) != len(uppers):
            raise RuntimeError("Bvh the same number of lower and upper bounds must be provided")

        if lowers.device != uppers.device:
            raise RuntimeError("Bvh lower and upper bounds must live on the same device")

        if lowers.dtype != vec3 or not lowers.is_contiguous:
            raise RuntimeError("Bvh lowers should be a contiguous array of type wp.vec3")

        if uppers.dtype != vec3 or not uppers.is_contiguous:
            raise RuntimeError("Bvh uppers should be a contiguous array of type wp.vec3")

        self.device = lowers.device
        # the bounds arrays are stored on the object
        self.lowers = lowers
        self.uppers = uppers

        def get_data(bounds):
            # falsy arrays are passed as a null pointer
            return ctypes.c_void_p(bounds.ptr) if bounds else ctypes.c_void_p(0)

        from warp.context import runtime

        lowers_ptr = get_data(lowers)
        uppers_ptr = get_data(uppers)
        num_bounds = int(len(lowers))

        if self.device.is_cpu:
            self.id = runtime.core.bvh_create_host(lowers_ptr, uppers_ptr, num_bounds)
        else:
            self.id = runtime.core.bvh_create_device(self.device.context, lowers_ptr, uppers_ptr, num_bounds)

    def __del__(self):
        """Destroy the native BVH; any exception raised while doing so is ignored."""
        try:
            from warp.context import runtime

            if self.device.is_cpu:
                runtime.core.bvh_destroy_host(self.id)
                return

            # use CUDA context guard to avoid side effects during garbage collection
            with self.device.context_guard:
                runtime.core.bvh_destroy_device(self.id)
        except Exception:
            pass

    def refit(self):
        """Refit the BVH. This should be called after users modify the `lowers` and `uppers` arrays."""
        from warp.context import runtime

        if self.device.is_cpu:
            runtime.core.bvh_refit_host(self.id)
            return

        runtime.core.bvh_refit_device(self.id)
        runtime.verify_cuda_device(self.device)
class Mesh:
    from warp.codegen import Var

    # attributes of a mesh that are exposed to generated code
    vars = {
        "points": Var("points", array(dtype=vec3)),
        "velocities": Var("velocities", array(dtype=vec3)),
        "indices": Var("indices", array(dtype=int32)),
    }

    def __init__(self, points=None, indices=None, velocities=None, support_winding_number=False):
        """Class representing a triangle mesh.

        Attributes:
            id: Unique identifier for this mesh object, can be passed to kernels.
            device: Device this object lives on, all buffers must live on the same device.

        Args:
            points (:class:`warp.array`): Array of vertex positions of type :class:`warp.vec3` (required)
            indices (:class:`warp.array`): Array of triangle indices of type :class:`warp.int32`, should be a
                flattened 1d array with three consecutive entries per triangle (required)
            velocities (:class:`warp.array`): Array of vertex velocities of type :class:`warp.vec3` (optional)
            support_winding_number (bool): If true the mesh will build additional datastructures to support `wp.mesh_query_point_sign_winding_number()` queries

        Raises:
            RuntimeError: If ``points`` or ``indices`` is missing, if they live on
                different devices, or if any array has the wrong dtype, is not
                contiguous, or (for ``indices``) is not one-dimensional.
        """
        # the signature defaults both to None, but a mesh cannot be built without them
        if points is None or indices is None:
            raise RuntimeError("Mesh requires both points and indices arrays")

        if points.device != indices.device:
            raise RuntimeError("Mesh points and indices must live on the same device")

        if points.dtype != vec3 or not points.is_contiguous:
            raise RuntimeError("Mesh points should be a contiguous array of type wp.vec3")

        if velocities and (velocities.dtype != vec3 or not velocities.is_contiguous):
            raise RuntimeError("Mesh velocities should be a contiguous array of type wp.vec3")

        if indices.dtype != int32 or not indices.is_contiguous:
            raise RuntimeError("Mesh indices should be a contiguous array of type wp.int32")

        if indices.ndim > 1:
            raise RuntimeError("Mesh indices should be a flattened 1d array of indices")

        self.device = points.device
        self.points = points
        self.velocities = velocities
        self.indices = indices

        from warp.context import runtime

        # a default-constructed array descriptor stands in for missing velocities
        velocities_desc = velocities.__ctype__() if velocities else array().__ctype__()

        create_args = (
            points.__ctype__(),
            velocities_desc,
            indices.__ctype__(),
            int(len(points)),
            # three indices per triangle; integer division keeps the count exact
            int(indices.size // 3),
            int(support_winding_number),
        )

        if self.device.is_cpu:
            self.id = runtime.core.mesh_create_host(*create_args)
        else:
            self.id = runtime.core.mesh_create_device(self.device.context, *create_args)

    def __del__(self):
        # best-effort cleanup: errors are ignored (e.g. `id` is never set when
        # construction failed validation)
        try:
            from warp.context import runtime

            if self.device.is_cpu:
                runtime.core.mesh_destroy_host(self.id)
            else:
                # use CUDA context guard to avoid side effects during garbage collection
                with self.device.context_guard:
                    runtime.core.mesh_destroy_device(self.id)
        except Exception:
            pass

    def refit(self):
        """Refit the BVH to points. This should be called after users modify the `points` data."""
        from warp.context import runtime

        if self.device.is_cpu:
            runtime.core.mesh_refit_host(self.id)
        else:
            runtime.core.mesh_refit_device(self.id)
            runtime.verify_cuda_device(self.device)
class Volume:
    #: Enum value to specify nearest-neighbor interpolation during sampling
    CLOSEST = constant(0)
    #: Enum value to specify trilinear interpolation during sampling
    LINEAR = constant(1)

    def __init__(self, data: array):
        """Class representing a sparse grid.

        Args:
            data (:class:`warp.array`): Array of bytes representing the volume in NanoVDB format.
                Passing ``None`` creates an empty placeholder with ``id == 0`` and no ``device``.

        Raises:
            RuntimeError: If ``data`` has no device or the native library fails to create the volume.
        """
        # id == 0 marks "no native volume"; set first so __del__ is always safe
        self.id = 0

        from warp.context import runtime

        self.context = runtime

        if data is None:
            return

        if data.device is None:
            raise RuntimeError("Invalid device")
        self.device = data.device

        if self.device.is_cpu:
            self.id = self.context.core.volume_create_host(ctypes.cast(data.ptr, ctypes.c_void_p), data.size)
        else:
            self.id = self.context.core.volume_create_device(
                self.device.context, ctypes.cast(data.ptr, ctypes.c_void_p), data.size
            )

        if self.id == 0:
            raise RuntimeError("Failed to create volume from input array")

    def __del__(self):
        # nothing to release for placeholder volumes
        if self.id == 0:
            return

        # best-effort cleanup: errors during finalization are deliberately ignored
        try:
            from warp.context import runtime

            if self.device.is_cpu:
                runtime.core.volume_destroy_host(self.id)
            else:
                # use CUDA context guard to avoid side effects during garbage collection
                with self.device.context_guard:
                    runtime.core.volume_destroy_device(self.id)
        except Exception:
            pass

    def array(self) -> array:
        """Returns the raw memory buffer of the Volume as an array

        The returned array does not own the memory (``owner=False``).

        Raises:
            RuntimeError: If the volume has no native object (``id == 0``).
        """
        # same guard as get_tiles()/get_voxel_size(); a placeholder volume has no device either
        if self.id == 0:
            raise RuntimeError("Invalid Volume")

        buf = ctypes.c_void_p(0)
        size = ctypes.c_uint64(0)
        if self.device.is_cpu:
            self.context.core.volume_get_buffer_info_host(self.id, ctypes.byref(buf), ctypes.byref(size))
        else:
            self.context.core.volume_get_buffer_info_device(self.id, ctypes.byref(buf), ctypes.byref(size))
        return array(ptr=buf.value, dtype=uint8, shape=size.value, device=self.device, owner=False)

    def get_tiles(self) -> array:
        """Returns the volume's tiles as an ``(num_tiles, 3)`` array of :class:`warp.int32`.

        The returned array is created with ``owner=True``.

        Raises:
            RuntimeError: If the volume has no native object (``id == 0``).
        """
        if self.id == 0:
            raise RuntimeError("Invalid Volume")

        buf = ctypes.c_void_p(0)
        size = ctypes.c_uint64(0)
        if self.device.is_cpu:
            self.context.core.volume_get_tiles_host(self.id, ctypes.byref(buf), ctypes.byref(size))
        else:
            self.context.core.volume_get_tiles_device(self.id, ctypes.byref(buf), ctypes.byref(size))

        # the reported size is in bytes; each tile is three 4-byte integers
        num_tiles = size.value // (3 * 4)
        return array(ptr=buf.value, dtype=int32, shape=(num_tiles, 3), device=self.device, owner=True)

    def get_voxel_size(self) -> Tuple[float, float, float]:
        """Returns the voxel size reported by the native library as an ``(x, y, z)`` tuple.

        Raises:
            RuntimeError: If the volume has no native object (``id == 0``).
        """
        if self.id == 0:
            raise RuntimeError("Invalid Volume")

        dx, dy, dz = ctypes.c_float(0), ctypes.c_float(0), ctypes.c_float(0)
        self.context.core.volume_get_voxel_size(self.id, ctypes.byref(dx), ctypes.byref(dy), ctypes.byref(dz))
        return (dx.value, dy.value, dz.value)

    @classmethod
    def load_from_nvdb(cls, file_or_buffer, device=None) -> Volume:
        """Creates a Volume object from a NanoVDB file or in-memory buffer.

        Args:
            file_or_buffer: Object with a ``read()`` method, or the raw bytes themselves.
            device: Device on which the volume data array is created.

        Returns:
            A ``warp.Volume`` object.

        Raises:
            RuntimeError: If the signature, version, grid count or codec is not supported.
        """
        # the bytes "NanoVDB0" read as a little-endian 64-bit integer
        nanovdb_magic = 0x304244566F6E614E

        try:
            data = file_or_buffer.read()
        except AttributeError:
            # no read() method: treat the argument as the buffer itself
            data = file_or_buffer

        magic, version, grid_count, codec = struct.unpack("<QIHH", data[0:16])
        if magic != nanovdb_magic:
            raise RuntimeError("NanoVDB signature not found")
        if version >> 21 != 32:  # checking major version
            raise RuntimeError("Unsupported NanoVDB version")
        if grid_count != 1:
            raise RuntimeError("Only NVDBs with exactly one grid are supported")

        # NOTE(review): 192 plus a 32-bit value stored at byte 152 locates the grid
        # payload; presumably a fixed header followed by a variable-size field - confirm
        # against the NanoVDB file format.
        grid_data_offset = 192 + struct.unpack("<I", data[152:156])[0]
        if codec == 0:  # no compression
            grid_data = data[grid_data_offset:]
        elif codec == 1:  # zip compression
            # the compressed stream starts 8 bytes after the offset
            grid_data = zlib.decompress(data[grid_data_offset + 8 :])
        else:
            raise RuntimeError(f"Unsupported codec code: {codec}")

        magic = struct.unpack("<Q", grid_data[0:8])[0]
        if magic != nanovdb_magic:
            raise RuntimeError("NanoVDB signature not found on grid!")

        data_array = array(np.frombuffer(grid_data, dtype=np.byte), device=device)
        return cls(data_array)

    @classmethod
    def load_from_numpy(
        cls, ndarray: np.ndarray, min_world=(0.0, 0.0, 0.0), voxel_size=1.0, bg_value=0.0, device=None
    ) -> Volume:
        """Creates a Volume object from a dense 3D NumPy array.

        This function is only supported for CUDA devices.

        Args:
            ndarray: Dense input values; 3D for scalar volumes, 4D with a trailing
                axis of length 3 when ``bg_value`` is array-like.
            min_world: The 3D coordinate of the lower corner of the volume.
            voxel_size: The size of each voxel in spatial coordinates.
            bg_value: Background value
            device: The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0".

        Returns:
            A ``warp.Volume`` object.
        """
        import math

        # round every spatial extent up to a whole number of 8-voxel tiles
        target_shape = (
            math.ceil(ndarray.shape[0] / 8) * 8,
            math.ceil(ndarray.shape[1] / 8) * 8,
            math.ceil(ndarray.shape[2] / 8) * 8,
        )

        is_vector = hasattr(bg_value, "__len__")

        if is_vector:
            # vec3, assuming the numpy array is 4D
            # np.empty allocates a buffer of the padded shape (np.array would instead
            # build a 4-element vector holding the shape itself)
            padded_array = np.empty((target_shape[0], target_shape[1], target_shape[2], 3), dtype=np.single)
            padded_array[:, :, :, :] = np.array(bg_value)
            padded_array[0 : ndarray.shape[0], 0 : ndarray.shape[1], 0 : ndarray.shape[2], :] = ndarray
        else:
            padded_amount = (
                target_shape[0] - ndarray.shape[0],
                target_shape[1] - ndarray.shape[1],
                target_shape[2] - ndarray.shape[2],
            )
            padded_array = np.pad(
                ndarray,
                ((0, padded_amount[0]), (0, padded_amount[1]), (0, padded_amount[2])),
                mode="constant",
                constant_values=bg_value,
            )

        shape = padded_array.shape
        volume = warp.Volume.allocate(
            min_world,
            [
                min_world[0] + (shape[0] - 1) * voxel_size,
                min_world[1] + (shape[1] - 1) * voxel_size,
                min_world[2] + (shape[2] - 1) * voxel_size,
            ],
            voxel_size,
            bg_value=bg_value,
            points_in_world_space=True,
            translation=min_world,
            device=device,
        )

        # Populate volume
        if is_vector:
            warp.launch(
                warp.utils.copy_dense_volume_to_nano_vdb_v,
                dim=(shape[0], shape[1], shape[2]),
                inputs=[volume.id, warp.array(padded_array, dtype=warp.vec3, device=device)],
                device=device,
            )
        elif isinstance(bg_value, int):
            warp.launch(
                warp.utils.copy_dense_volume_to_nano_vdb_i,
                dim=shape,
                inputs=[volume.id, warp.array(padded_array, dtype=warp.int32, device=device)],
                device=device,
            )
        else:
            warp.launch(
                warp.utils.copy_dense_volume_to_nano_vdb_f,
                dim=shape,
                inputs=[volume.id, warp.array(padded_array, dtype=warp.float32, device=device)],
                device=device,
            )

        return volume

    @classmethod
    def allocate(
        cls,
        min: List[int],
        max: List[int],
        voxel_size: float,
        bg_value=0.0,
        translation=(0.0, 0.0, 0.0),
        points_in_world_space=False,
        device=None,
    ) -> Volume:
        """Allocate a new Volume based on the bounding box defined by min and max.

        This function is only supported for CUDA devices.

        Allocate a volume that is large enough to contain voxels [min[0], min[1], min[2]] - [max[0], max[1], max[2]], inclusive.
        If points_in_world_space is true, then min and max are first converted to index space with the given voxel size and
        translation, and the volume is allocated with those.

        The smallest unit of allocation is a dense tile of 8x8x8 voxels, the requested bounding box is rounded up to tiles, and
        the resulting tiles will be available in the new volume.

        Args:
            min (array-like): Lower 3D coordinates of the bounding box in index space or world space, inclusive.
            max (array-like): Upper 3D coordinates of the bounding box in index space or world space, inclusive.
            voxel_size (float): Voxel size of the new volume.
            bg_value (float or array-like): Value of unallocated voxels of the volume, also defines the volume's type, a :class:`warp.vec3` volume is created if this is `array-like`, otherwise a float volume is created
            translation (array-like): translation between the index and world spaces.
            points_in_world_space (bool): Whether ``min`` and ``max`` are given in world space.
            device (Devicelike): The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0".
        """
        if points_in_world_space:
            min = np.around((np.array(min, dtype=np.float32) - translation) / voxel_size)
            max = np.around((np.array(max, dtype=np.float32) - translation) / voxel_size)

        # floor division maps voxel indices to the index of their 8x8x8 tile
        tile_min = np.array(min, dtype=np.int32) // 8
        tile_max = np.array(max, dtype=np.int32) // 8
        tiles = np.array(
            [
                [i, j, k]
                for i in range(tile_min[0], tile_max[0] + 1)
                for j in range(tile_min[1], tile_max[1] + 1)
                for k in range(tile_min[2], tile_max[2] + 1)
            ],
            dtype=np.int32,
        )
        # one index-space point per tile: the tile's lower corner
        tile_points = array(tiles * 8, device=device)

        return cls.allocate_by_tiles(tile_points, voxel_size, bg_value, translation, device)

    @classmethod
    def allocate_by_tiles(
        cls, tile_points: array, voxel_size: float, bg_value=0.0, translation=(0.0, 0.0, 0.0), device=None
    ) -> Volume:
        """Allocate a new Volume with active tiles for each point tile_points.

        This function is only supported for CUDA devices.

        The smallest unit of allocation is a dense tile of 8x8x8 voxels.
        This is the primary method for allocating sparse volumes. It uses an array of points indicating the tiles that must be allocated.

        Example use cases:
            * `tile_points` can mark tiles directly in index space as in the case this method is called by `allocate`.
            * `tile_points` can be a list of points used in a simulation that needs to transfer data to a volume.

        Args:
            tile_points (:class:`warp.array`): Array of positions that define the tiles to be allocated.
                The array can be a 2D, N-by-3 array of :class:`warp.int32` values, indicating index space positions,
                or can be a 1D array of :class:`warp.vec3` values, indicating world space positions.
                Repeated points per tile are allowed and will be efficiently deduplicated.
            voxel_size (float): Voxel size of the new volume.
            bg_value (float or array-like): Value of unallocated voxels of the volume, also defines the volume's type, a :class:`warp.vec3` volume is created if this is `array-like`, otherwise a float volume is created
            translation (array-like): Translation between the index and world spaces.
            device (Devicelike): The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0".

        Raises:
            RuntimeError: If the voxel size is not positive, the device is not a CUDA
                device, ``tile_points`` has an unsupported type, or the native
                allocation fails.
        """
        from warp.context import runtime

        device = runtime.get_device(device)

        if voxel_size <= 0.0:
            raise RuntimeError(f"Voxel size must be positive! Got {voxel_size}")
        if not device.is_cuda:
            raise RuntimeError("Only CUDA devices are supported for allocate_by_tiles")

        # the isinstance test must guard BOTH layouts; explicit grouping is required
        # because `and` binds tighter than `or`
        if not (
            isinstance(tile_points, array)
            and (
                (tile_points.dtype == int32 and tile_points.ndim == 2)
                or (tile_points.dtype == vec3 and tile_points.ndim == 1)
            )
        ):
            raise RuntimeError("Expected an warp array of vec3s or of n-by-3 int32s as tile_points!")

        # copy the points onto the target device when they do not live on a CUDA device
        if not tile_points.device.is_cuda:
            tile_points = array(tile_points, dtype=tile_points.dtype, device=device)

        # start from a placeholder and fill in device and id by hand
        volume = cls(data=None)
        volume.device = device
        in_world_space = tile_points.dtype == vec3

        if hasattr(bg_value, "__len__"):
            volume.id = volume.context.core.volume_v_from_tiles_device(
                volume.device.context,
                ctypes.c_void_p(tile_points.ptr),
                tile_points.shape[0],
                voxel_size,
                bg_value[0],
                bg_value[1],
                bg_value[2],
                translation[0],
                translation[1],
                translation[2],
                in_world_space,
            )
        elif isinstance(bg_value, int):
            volume.id = volume.context.core.volume_i_from_tiles_device(
                volume.device.context,
                ctypes.c_void_p(tile_points.ptr),
                tile_points.shape[0],
                voxel_size,
                bg_value,
                translation[0],
                translation[1],
                translation[2],
                in_world_space,
            )
        else:
            volume.id = volume.context.core.volume_f_from_tiles_device(
                volume.device.context,
                ctypes.c_void_p(tile_points.ptr),
                tile_points.shape[0],
                voxel_size,
                float(bg_value),
                translation[0],
                translation[1],
                translation[2],
                in_world_space,
            )

        if volume.id == 0:
            raise RuntimeError("Failed to create volume")

        return volume
# definition just for kernel type (cannot be a parameter), see mesh.h
# NOTE: its layout must match the corresponding struct defined in C.
# NOTE: it needs to be defined after `indexedarray` to workaround a circular import issue.
class mesh_query_point_t:
    """Result record produced by the mesh point-query functions.

    Attributes:
        result (bool): True when a point satisfying the query constraints was found.
        sign (float32): Negative when the query point lies inside the mesh, non-negative otherwise.
                        The mesh has to be watertight for this to be reliable.
        face (int32): Index of the closest face.
        u (float32): Barycentric u coordinate of the closest point.
        v (float32): Barycentric v coordinate of the closest point.

    See Also:
        :func:`mesh_query_point`, :func:`mesh_query_point_no_sign`,
        :func:`mesh_query_furthest_point_no_sign`,
        :func:`mesh_query_point_sign_normal`,
        and :func:`mesh_query_point_sign_winding_number`.
    """

    from warp.codegen import Var

    # field order is significant: it has to follow the layout of the C struct
    vars = dict(
        result=Var("result", bool),
        sign=Var("sign", float32),
        face=Var("face", int32),
        u=Var("u", float32),
        v=Var("v", float32),
    )
# definition just for kernel type (cannot be a parameter), see mesh.h
# NOTE: its layout must match the corresponding struct defined in C.
class mesh_query_ray_t:
    """Result record produced by the mesh ray-query functions.

    Attributes:
        result (bool): True when a hit satisfying the query constraints was found.
        sign (float32): Positive when the ray hit the front of the face, negative otherwise.
        face (int32): Index of the closest face.
        t (float32): Distance of the closest hit along the ray.
        u (float32): Barycentric u coordinate of the closest hit.
        v (float32): Barycentric v coordinate of the closest hit.
        normal (vec3f): Face normal.

    See Also:
        :func:`mesh_query_ray`.
    """

    from warp.codegen import Var

    # field order is significant: it has to follow the layout of the C struct
    vars = dict(
        result=Var("result", bool),
        sign=Var("sign", float32),
        face=Var("face", int32),
        t=Var("t", float32),
        u=Var("u", float32),
        v=Var("v", float32),
        normal=Var("normal", vec3),
    )
def matmul(
    a: array2d,
    b: array2d,
    c: array2d,
    d: array2d,
    alpha: float = 1.0,
    beta: float = 0.0,
    allow_tf32x3_arith: builtins.bool = False,
    device=None,
):
    """Computes a generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.

    Args:
        a (array2d): two-dimensional array containing matrix A
        b (array2d): two-dimensional array containing matrix B
        c (array2d): two-dimensional array containing matrix C
        d (array2d): two-dimensional array to which output D is written
        alpha (float): parameter alpha of GEMM
        beta (float): parameter beta of GEMM
        allow_tf32x3_arith (bool): whether to use CUTLASS's 3xTF32 GEMMs, which enable accuracy similar to FP32
            while using Tensor Cores
        device: device we want to use to multiply matrices. Defaults to active runtime device. If "cpu", resorts to using numpy multiplication.

    Raises:
        RuntimeError: If the arrays are on a different device than ``device``, have
            mismatching dtypes or shapes, are not laid out as required, or the
            GEMM call reports failure.
    """
    from warp.context import runtime

    # Resolve unconditionally so that aliases such as "cuda:0" become device
    # objects too; `device.arch` is needed below.
    device = runtime.get_device(device)

    if a.device != device or b.device != device or c.device != device or d.device != device:
        raise RuntimeError("Matrices A, B, C, and D must all be on the same device as the runtime device.")

    if a.dtype != b.dtype or a.dtype != c.dtype or a.dtype != d.dtype:
        raise RuntimeError(
            "wp.matmul currently only supports operation between {A, B, C, D} matrices of the same type."
        )

    if (
        (not a.is_contiguous and not a.is_transposed)
        or (not b.is_contiguous and not b.is_transposed)
        or (not c.is_contiguous)
        or (not d.is_contiguous)
    ):
        raise RuntimeError(
            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B may be transposed."
        )

    m = a.shape[0]
    n = b.shape[1]
    k = a.shape[1]
    if b.shape != (k, n) or c.shape != (m, n) or d.shape != (m, n):
        raise RuntimeError(
            "Invalid shapes for matrices: A = {} B = {} C = {} D = {}".format(a.shape, b.shape, c.shape, d.shape)
        )

    # register the adjoint with the active tape, if any, before computing the product
    if runtime.tape:
        runtime.tape.record_func(
            backward=lambda: adj_matmul(
                a, b, c, a.grad, b.grad, c.grad, d.grad, alpha, beta, allow_tf32x3_arith, device
            ),
            arrays=[a, b, c, d],
        )

    # cpu fallback if no cuda devices found
    if device.is_cpu:
        d.assign(alpha * (a.numpy() @ b.numpy()) + beta * c.numpy())
        return

    cc = device.arch
    ret = runtime.core.cutlass_gemm(
        cc,
        m,
        n,
        k,
        type_typestr(a.dtype).encode(),
        ctypes.c_void_p(a.ptr),
        ctypes.c_void_p(b.ptr),
        ctypes.c_void_p(c.ptr),
        ctypes.c_void_p(d.ptr),
        alpha,
        beta,
        not a.is_transposed,
        not b.is_transposed,
        allow_tf32x3_arith,
        1,  # batch count
    )
    if not ret:
        raise RuntimeError("matmul failed.")
def adj_matmul(
    a: array2d,
    b: array2d,
    c: array2d,
    adj_a: array2d,
    adj_b: array2d,
    adj_c: array2d,
    adj_d: array2d,
    alpha: float = 1.0,
    beta: float = 0.0,
    allow_tf32x3_arith: builtins.bool = False,
    device=None,
):
    """Computes the adjoint of a generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.
        note: the adjoint of parameter alpha is not included but can be computed as `adj_alpha = np.sum(np.concatenate(np.multiply(a @ b, adj_d)))`.
        note: the adjoint of parameter beta is not included but can be computed as `adj_beta = np.sum(np.concatenate(np.multiply(c, adj_d)))`.

    The results are accumulated into ``adj_a``, ``adj_b`` and ``adj_c``.

    Args:
        a (array2d): two-dimensional array containing matrix A
        b (array2d): two-dimensional array containing matrix B
        c (array2d): two-dimensional array containing matrix C
        adj_a (array2d): two-dimensional array to which the adjoint of matrix A is written
        adj_b (array2d): two-dimensional array to which the adjoint of matrix B is written
        adj_c (array2d): two-dimensional array to which the adjoint of matrix C is written
        adj_d (array2d): two-dimensional array containing the adjoint of matrix D
        alpha (float): parameter alpha of GEMM
        beta (float): parameter beta of GEMM
        allow_tf32x3_arith (bool): whether to use CUTLASS's 3xTF32 GEMMs, which enable accuracy similar to FP32
            while using Tensor Cores
        device: device we want to use to multiply matrices. Defaults to active runtime device. If "cpu", resorts to using numpy multiplication.

    Raises:
        RuntimeError: If the arrays are on a different device than ``device``, have
            mismatching dtypes or shapes, are not laid out as required, or a
            GEMM call reports failure.
    """
    from warp.context import runtime

    # Resolve unconditionally so that aliases such as "cuda:0" become device
    # objects too; `device.arch` is needed below.
    device = runtime.get_device(device)

    if (
        a.device != device
        or b.device != device
        or c.device != device
        or adj_a.device != device
        or adj_b.device != device
        or adj_c.device != device
        or adj_d.device != device
    ):
        raise RuntimeError(
            "Matrices A, B, C, D, and their adjoints must all be on the same device as the runtime device."
        )

    if (
        a.dtype != b.dtype
        or a.dtype != c.dtype
        or a.dtype != adj_a.dtype
        or a.dtype != adj_b.dtype
        or a.dtype != adj_c.dtype
        or a.dtype != adj_d.dtype
    ):
        raise RuntimeError(
            "wp.adj_matmul currently only supports operation between {A, B, C, adj_D, adj_A, adj_B, adj_C} matrices of the same type."
        )

    if (
        (not a.is_contiguous and not a.is_transposed)
        or (not b.is_contiguous and not b.is_transposed)
        or (not c.is_contiguous)
        or (not adj_a.is_contiguous and not adj_a.is_transposed)
        or (not adj_b.is_contiguous and not adj_b.is_transposed)
        or (not adj_c.is_contiguous)
        or (not adj_d.is_contiguous)
    ):
        raise RuntimeError(
            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B and their associated adjoints may be transposed."
        )

    m = a.shape[0]
    n = b.shape[1]
    k = a.shape[1]
    if (
        a.shape != (m, k)
        or b.shape != (k, n)
        or c.shape != (m, n)
        or adj_d.shape != (m, n)
        or adj_a.shape != (m, k)
        or adj_b.shape != (k, n)
        or adj_c.shape != (m, n)
    ):
        raise RuntimeError(
            "Invalid shapes for matrices: A = {} B = {} C = {} adj_D = {} adj_A = {} adj_B = {} adj_C = {}".format(
                a.shape, b.shape, c.shape, adj_d.shape, adj_a.shape, adj_b.shape, adj_c.shape
            )
        )

    # cpu fallback if no cuda devices found
    if device.is_cpu:
        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose()) + adj_a.numpy())
        adj_b.assign(alpha * (a.numpy().transpose() @ adj_d.numpy()) + adj_b.numpy())
        adj_c.assign(beta * adj_d.numpy() + adj_c.numpy())
        return

    cc = device.arch
    typestr = type_typestr(a.dtype).encode()

    def gemm_accumulate(rows, cols, inner, lhs, rhs, out, lhs_flag, rhs_flag):
        # Accumulating GEMM: `out` is both the C input (with beta = 1.0) and the
        # D output. The two flags are forwarded verbatim into the positions where
        # matmul() passes `not a.is_transposed` / `not b.is_transposed`.
        ok = runtime.core.cutlass_gemm(
            cc,
            rows,
            cols,
            inner,
            typestr,
            ctypes.c_void_p(lhs.ptr),
            ctypes.c_void_p(rhs.ptr),
            ctypes.c_void_p(out.ptr),
            ctypes.c_void_p(out.ptr),
            alpha,
            1.0,
            lhs_flag,
            rhs_flag,
            allow_tf32x3_arith,
            1,  # batch count
        )
        if not ok:
            raise RuntimeError("adj_matmul failed.")

    # adj_a
    if not a.is_transposed:
        gemm_accumulate(m, k, n, adj_d, b, adj_a, True, b.is_transposed)
    else:
        gemm_accumulate(k, m, n, b, adj_d, adj_a, not b.is_transposed, False)

    # adj_b
    if not b.is_transposed:
        gemm_accumulate(k, n, m, a, adj_d, adj_b, a.is_transposed, True)
    else:
        gemm_accumulate(n, k, m, adj_d, a, adj_b, False, not a.is_transposed)

    # adj_c
    warp.launch(
        kernel=warp.utils.add_kernel_2d,
        dim=adj_c.shape,
        inputs=[adj_c, adj_d, adj_d.dtype(beta)],
        device=device,
        record_tape=False,
    )
def batched_matmul(
    a: array3d,
    b: array3d,
    c: array3d,
    d: array3d,
    alpha: float = 1.0,
    beta: float = 0.0,
    allow_tf32x3_arith: builtins.bool = False,
    device=None,
):
    """Computes a batched generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.

    Args:
        a (array3d): three-dimensional array containing A matrices. Overall array dimension is {batch_count, M, K}
        b (array3d): three-dimensional array containing B matrices. Overall array dimension is {batch_count, K, N}
        c (array3d): three-dimensional array containing C matrices. Overall array dimension is {batch_count, M, N}
        d (array3d): three-dimensional array to which output D is written. Overall array dimension is {batch_count, M, N}
        alpha (float): parameter alpha of GEMM
        beta (float): parameter beta of GEMM
        allow_tf32x3_arith (bool): whether to use CUTLASS's 3xTF32 GEMMs, which enable accuracy similar to FP32
            while using Tensor Cores
        device: device we want to use to multiply matrices. Defaults to active runtime device. If "cpu", resorts to using numpy multiplication.

    Raises:
        RuntimeError: If the arrays are on a different device than ``device``, have
            mismatching dtypes or shapes, are not laid out as required, or a
            GEMM call reports failure.
    """
    from warp.context import runtime

    # Resolve unconditionally so that aliases such as "cuda:0" become device
    # objects too; `device.arch` is needed below.
    device = runtime.get_device(device)

    if a.device != device or b.device != device or c.device != device or d.device != device:
        raise RuntimeError("Matrices A, B, C, and D must all be on the same device as the runtime device.")

    if a.dtype != b.dtype or a.dtype != c.dtype or a.dtype != d.dtype:
        raise RuntimeError(
            "wp.batched_matmul currently only supports operation between {A, B, C, D} matrices of the same type."
        )

    if (
        (not a.is_contiguous and not a.is_transposed)
        or (not b.is_contiguous and not b.is_transposed)
        or (not c.is_contiguous)
        or (not d.is_contiguous)
    ):
        raise RuntimeError(
            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B may be transposed."
        )

    m = a.shape[1]
    n = b.shape[2]
    k = a.shape[2]
    batch_count = a.shape[0]
    if b.shape != (batch_count, k, n) or c.shape != (batch_count, m, n) or d.shape != (batch_count, m, n):
        raise RuntimeError(
            "Invalid shapes for matrices: A = {} B = {} C = {} D = {}".format(a.shape, b.shape, c.shape, d.shape)
        )

    # register the adjoint with the active tape, if any, before computing the product
    if runtime.tape:
        runtime.tape.record_func(
            backward=lambda: adj_batched_matmul(
                a, b, c, a.grad, b.grad, c.grad, d.grad, alpha, beta, allow_tf32x3_arith, device
            ),
            arrays=[a, b, c, d],
        )

    # cpu fallback if no cuda devices found
    if device.is_cpu:
        d.assign(alpha * np.matmul(a.numpy(), b.numpy()) + beta * c.numpy())
        return

    # handle case in which batch_count exceeds max_batch_count, which is a CUDA array size maximum
    max_batch_count = 65535

    cc = device.arch
    typestr = type_typestr(a.dtype).encode()

    # One GEMM per chunk of at most max_batch_count matrices. The final chunk
    # carries the remainder; no call is issued for an empty chunk.
    for idx_start in range(0, batch_count, max_batch_count):
        chunk_count = min(max_batch_count, batch_count - idx_start)
        idx_end = idx_start + chunk_count
        ret = runtime.core.cutlass_gemm(
            cc,
            m,
            n,
            k,
            typestr,
            ctypes.c_void_p(a[idx_start:idx_end, :, :].ptr),
            ctypes.c_void_p(b[idx_start:idx_end, :, :].ptr),
            ctypes.c_void_p(c[idx_start:idx_end, :, :].ptr),
            ctypes.c_void_p(d[idx_start:idx_end, :, :].ptr),
            alpha,
            beta,
            not a.is_transposed,
            not b.is_transposed,
            allow_tf32x3_arith,
            chunk_count,
        )
        if not ret:
            raise RuntimeError("Batched matmul failed.")
def adj_batched_matmul(
    a: array3d,
    b: array3d,
    c: array3d,
    adj_a: array3d,
    adj_b: array3d,
    adj_c: array3d,
    adj_d: array3d,
    alpha: float = 1.0,
    beta: float = 0.0,
    allow_tf32x3_arith: builtins.bool = False,
    device=None,
):
    """Computes the adjoint of a batched generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.

    The results are accumulated into ``adj_a``, ``adj_b`` and ``adj_c``.

    Args:
        a (array3d): three-dimensional array containing A matrices. Overall array dimension is {batch_count, M, K}
        b (array3d): three-dimensional array containing B matrices. Overall array dimension is {batch_count, K, N}
        c (array3d): three-dimensional array containing C matrices. Overall array dimension is {batch_count, M, N}
        adj_a (array3d): three-dimensional array to which the adjoints of A matrices are written. Overall array dimension is {batch_count, M, K}
        adj_b (array3d): three-dimensional array to which the adjoints of B matrices are written. Overall array dimension is {batch_count, K, N}
        adj_c (array3d): three-dimensional array to which the adjoints of C matrices are written. Overall array dimension is {batch_count, M, N}
        adj_d (array3d): three-dimensional array containing adjoints of D matrices. Overall array dimension is {batch_count, M, N}
        alpha (float): parameter alpha of GEMM
        beta (float): parameter beta of GEMM
        allow_tf32x3_arith (bool): whether to use CUTLASS's 3xTF32 GEMMs, which enable accuracy similar to FP32
            while using Tensor Cores
        device: device we want to use to multiply matrices. Defaults to active runtime device. If "cpu", resorts to using numpy multiplication.

    Raises:
        RuntimeError: If the arrays are on a different device than ``device``, have
            mismatching dtypes or shapes, are not laid out as required, or a
            GEMM call reports failure.
    """
    from warp.context import runtime

    # Resolve unconditionally so that aliases such as "cuda:0" become device
    # objects too; `device.arch` is needed below.
    device = runtime.get_device(device)

    if (
        a.device != device
        or b.device != device
        or c.device != device
        or adj_a.device != device
        or adj_b.device != device
        or adj_c.device != device
        or adj_d.device != device
    ):
        raise RuntimeError(
            "Matrices A, B, C, D, and their adjoints must all be on the same device as the runtime device."
        )

    if (
        a.dtype != b.dtype
        or a.dtype != c.dtype
        or a.dtype != adj_a.dtype
        or a.dtype != adj_b.dtype
        or a.dtype != adj_c.dtype
        or a.dtype != adj_d.dtype
    ):
        raise RuntimeError(
            "wp.adj_batched_matmul currently only supports operation between {A, B, C, adj_D, adj_A, adj_B, adj_C} matrices of the same type."
        )

    m = a.shape[1]
    n = b.shape[2]
    k = a.shape[2]
    batch_count = a.shape[0]
    if (
        b.shape != (batch_count, k, n)
        or c.shape != (batch_count, m, n)
        or adj_d.shape != (batch_count, m, n)
        or adj_a.shape != (batch_count, m, k)
        or adj_b.shape != (batch_count, k, n)
        or adj_c.shape != (batch_count, m, n)
    ):
        raise RuntimeError(
            "Invalid shapes for matrices: A = {} B = {} C = {} adj_D = {} adj_A = {} adj_B = {} adj_C = {}".format(
                a.shape, b.shape, c.shape, adj_d.shape, adj_a.shape, adj_b.shape, adj_c.shape
            )
        )

    if (
        (not a.is_contiguous and not a.is_transposed)
        or (not b.is_contiguous and not b.is_transposed)
        or (not c.is_contiguous)
        or (not adj_a.is_contiguous and not adj_a.is_transposed)
        or (not adj_b.is_contiguous and not adj_b.is_transposed)
        or (not adj_c.is_contiguous)
        or (not adj_d.is_contiguous)
    ):
        raise RuntimeError(
            "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B and their associated adjoints may be transposed."
        )

    # cpu fallback if no cuda devices found
    if device.is_cpu:
        adj_a.assign(alpha * np.matmul(adj_d.numpy(), b.numpy().transpose((0, 2, 1))) + adj_a.numpy())
        adj_b.assign(alpha * np.matmul(a.numpy().transpose((0, 2, 1)), adj_d.numpy()) + adj_b.numpy())
        adj_c.assign(beta * adj_d.numpy() + adj_c.numpy())
        return

    # handle case in which batch_count exceeds max_batch_count, which is a CUDA array size maximum
    max_batch_count = 65535

    cc = device.arch
    typestr = type_typestr(a.dtype).encode()

    def gemm_accumulate(rows, cols, inner, lhs, rhs, out, lhs_flag, rhs_flag, idx_start, chunk_count):
        # Accumulating GEMM over one chunk of the batch: `out` is both the C input
        # (with beta = 1.0) and the D output. The two flags are forwarded verbatim
        # into the positions where batched_matmul() passes `not a.is_transposed` /
        # `not b.is_transposed`.
        idx_end = idx_start + chunk_count
        out_ptr = ctypes.c_void_p(out[idx_start:idx_end, :, :].ptr)
        ok = runtime.core.cutlass_gemm(
            cc,
            rows,
            cols,
            inner,
            typestr,
            ctypes.c_void_p(lhs[idx_start:idx_end, :, :].ptr),
            ctypes.c_void_p(rhs[idx_start:idx_end, :, :].ptr),
            out_ptr,
            out_ptr,
            alpha,
            1.0,
            lhs_flag,
            rhs_flag,
            allow_tf32x3_arith,
            chunk_count,
        )
        if not ok:
            raise RuntimeError("adj_matmul failed.")

    # One pair of GEMMs per chunk of at most max_batch_count matrices. The final
    # chunk carries the remainder; no call is issued for an empty chunk.
    for idx_start in range(0, batch_count, max_batch_count):
        chunk_count = min(max_batch_count, batch_count - idx_start)

        # adj_a
        if not a.is_transposed:
            gemm_accumulate(m, k, n, adj_d, b, adj_a, True, b.is_transposed, idx_start, chunk_count)
        else:
            gemm_accumulate(k, m, n, b, adj_d, adj_a, not b.is_transposed, False, idx_start, chunk_count)

        # adj_b
        if not b.is_transposed:
            gemm_accumulate(k, n, m, a, adj_d, adj_b, a.is_transposed, True, idx_start, chunk_count)
        else:
            gemm_accumulate(n, k, m, adj_d, a, adj_b, False, not a.is_transposed, idx_start, chunk_count)

    # adj_c
    warp.launch(
        kernel=warp.utils.add_kernel_3d,
        dim=adj_c.shape,
        inputs=[adj_c, adj_d, adj_d.dtype(beta)],
        device=device,
        record_tape=False,
    )
class HashGrid:
    """Hash grid object for accelerated point queries.

    Attributes:
        id: Unique identifier for this hash grid object, can be passed to kernels.
        device: Device this object lives on, all buffers must live on the same device.
        reserved: Indicates whether the grid data has been reserved for use by a kernel.
    """

    def __init__(self, dim_x, dim_y, dim_z, device=None):
        """Create the native hash grid on the requested device.

        Args:
            dim_x (int): Number of cells in x-axis
            dim_y (int): Number of cells in y-axis
            dim_z (int): Number of cells in z-axis
            device: Device specifier, resolved through ``runtime.get_device()``.
        """
        from warp.context import runtime

        self.device = runtime.get_device(device)

        # host and device grids are created through different native entry points
        if not self.device.is_cpu:
            self.id = runtime.core.hash_grid_create_device(self.device.context, dim_x, dim_y, dim_z)
        else:
            self.id = runtime.core.hash_grid_create_host(dim_x, dim_y, dim_z)

        # nothing has been reserved for kernel use yet
        self.reserved = False

    def build(self, points, radius):
        """Updates the hash grid data structure.

        This method rebuilds the underlying datastructure and should be called any time the set
        of points changes.

        Args:
            points (:class:`warp.array`): Array of points of type :class:`warp.vec3`
            radius (float): The cell size to use for bucketing points, cells are cubes with edges of this width.
                For best performance the radius used to construct the grid should match closely to
                the radius used when performing queries.
        """
        from warp.context import runtime

        # pick the native update routine that matches where the grid lives
        update = (
            runtime.core.hash_grid_update_host
            if self.device.is_cpu
            else runtime.core.hash_grid_update_device
        )
        update(self.id, radius, ctypes.cast(points.ptr, ctypes.c_void_p), len(points))

        self.reserved = True

    def reserve(self, num_points):
        """Reserve grid storage for ``num_points`` points and mark the grid as reserved.

        Args:
            num_points: Point count forwarded to the native reserve routine.
        """
        from warp.context import runtime

        # pick the native reserve routine that matches where the grid lives
        reserve_grid = (
            runtime.core.hash_grid_reserve_host
            if self.device.is_cpu
            else runtime.core.hash_grid_reserve_device
        )
        reserve_grid(self.id, num_points)

        self.reserved = True

    def __del__(self):
        # Best-effort cleanup: any failure while destroying the native grid
        # (including a partially constructed instance) is deliberately ignored.
        try:
            from warp.context import runtime

            if self.device.is_cpu:
                runtime.core.hash_grid_destroy_host(self.id)
                return

            # use CUDA context guard to avoid side effects during garbage collection
            with self.device.context_guard:
                runtime.core.hash_grid_destroy_device(self.id)
        except Exception:
            pass
class MarchingCubes:
    """Marching cubes surfacer backed by the native CUDA implementation.

    Attributes:
        device: Device the surfacer and its buffers live on (CUDA only).
        nx, ny, nz: Dimensions forwarded to the native surfacer on each call to :meth:`surface`.
        max_verts, max_tris: Vertex/triangle limits forwarded to the native surfacer.
        verts: Output vertex array of ``vec3``, allocated with ``max_verts`` entries.
        indices: Output index array of ``int``, allocated with ``max_tris * 3`` entries.
        id: Handle of the native surfacer object.
    """

    def __init__(self, nx: int, ny: int, nz: int, max_verts: int, max_tris: int, device=None):
        """Allocate the output buffers and the native surfacer.

        Args:
            nx: Dimension along x forwarded to the native surfacer.
            ny: Dimension along y forwarded to the native surfacer.
            nz: Dimension along z forwarded to the native surfacer.
            max_verts: Number of vertices the output buffer is allocated for.
            max_tris: Number of triangles the output buffer is allocated for.
            device: Device specifier, resolved through ``runtime.get_device()``.

        Raises:
            RuntimeError: If the resolved device is not a CUDA device.
        """
        from warp.context import runtime

        self.device = runtime.get_device(device)

        if not self.device.is_cuda:
            raise RuntimeError("Only CUDA devices are supported for marching cubes")

        self.nx = nx
        self.ny = ny
        self.nz = nz

        self.max_verts = max_verts
        self.max_tris = max_tris

        # bindings to warp.so
        self.alloc = runtime.core.marching_cubes_create_device
        self.alloc.argtypes = [ctypes.c_void_p]
        self.alloc.restype = ctypes.c_uint64
        self.free = runtime.core.marching_cubes_destroy_device

        from warp.context import zeros

        self.verts = zeros(max_verts, dtype=vec3, device=self.device)
        self.indices = zeros(max_tris * 3, dtype=int, device=self.device)

        # alloc surfacer
        # (assigned last, so the presence of `id` implies `device` and `free` are set)
        self.id = ctypes.c_uint64(self.alloc(self.device.context))

    def __del__(self):
        # __init__ may have raised before the surfacer was allocated (e.g. for
        # a non-CUDA device); in that case there is nothing to destroy.
        if not hasattr(self, "id"):
            return

        # use CUDA context guard to avoid side effects during garbage collection
        with self.device.context_guard:
            # destroy surfacer
            self.free(self.id)

    def resize(self, nx: int, ny: int, nz: int, max_verts: int, max_tris: int):
        """Store new dimensions and limits for subsequent calls to :meth:`surface`.

        Only the stored values change here; no buffers are touched.
        """
        # actual allocations will be resized on next call to surface()
        # NOTE(review): surface() in this class passes the stored limits together
        # with the buffers allocated in __init__ -- confirm the reallocation
        # happens on the native side before growing max_verts / max_tris.
        self.nx = nx
        self.ny = ny
        self.nz = nz

        self.max_verts = max_verts
        self.max_tris = max_tris

    def surface(self, field: array(dtype=float), threshold: float):
        """Run the native surfacer on ``field`` and update ``verts`` / ``indices``.

        Args:
            field: Array whose ``ptr`` is handed to the native surfacer.
            threshold: Threshold value forwarded to the native surfacer as a C float.

        Raises:
            RuntimeError: If the native call reports an error; the message
                includes the vertex and triangle counts it reported.
        """
        from warp.context import runtime

        # WP_API int marching_cubes_surface_host(const float* field, int nx, int ny, int nz, float threshold, wp::vec3* verts, int* triangles, int max_verts, int max_tris, int* out_num_verts, int* out_num_tris);
        num_verts = ctypes.c_int(0)
        num_tris = ctypes.c_int(0)

        runtime.core.marching_cubes_surface_device.restype = ctypes.c_int

        error = runtime.core.marching_cubes_surface_device(
            self.id,
            ctypes.cast(field.ptr, ctypes.c_void_p),
            self.nx,
            self.ny,
            self.nz,
            ctypes.c_float(threshold),
            ctypes.cast(self.verts.ptr, ctypes.c_void_p),
            ctypes.cast(self.indices.ptr, ctypes.c_void_p),
            self.max_verts,
            self.max_tris,
            # out-parameters: the native call writes the resulting counts here
            ctypes.c_void_p(ctypes.addressof(num_verts)),
            ctypes.c_void_p(ctypes.addressof(num_tris)),
        )

        if error:
            # f-string with `.value` so the counts actually appear in the message
            raise RuntimeError(
                f"Buffers may not be large enough, marching cubes required at least {num_verts.value} vertices, and {num_tris.value} triangles."
            )

        # resize the geometry arrays
        self.verts.shape = (num_verts.value,)
        self.indices.shape = (num_tris.value * 3,)

        self.verts.size = num_verts.value
        self.indices.size = num_tris.value * 3
def type_is_generic(t):
    """Return whether ``t`` is a generic (not fully specified) type.

    A type is generic when it is one of ``Any``, ``Scalar``, ``Float`` or
    ``Int``, an array whose ``dtype`` is generic, or a vector/matrix type
    whose scalar type is generic or that has a dimension equal to 0.

    Returns:
        bool: Always ``True`` or ``False``; concrete vector/matrix types
        yield ``False`` rather than falling through to ``None``.
    """
    if t in (Any, Scalar, Float, Int):
        return True

    if is_array(t):
        return type_is_generic(t.dtype)

    if hasattr(t, "_wp_scalar_type_"):
        # vector/matrix type, check if dtype is generic
        if type_is_generic(t._wp_scalar_type_):
            return True

        # check if any dimension is generic (a zero extent marks a generic dimension)
        return any(d == 0 for d in t._shape_)

    return False
def type_is_generic_scalar(t):
    """Return whether ``t`` is one of the generic scalar placeholders (``Scalar``, ``Float``, ``Int``)."""
    generic_scalar_kinds = (Scalar, Float, Int)
    return t in generic_scalar_kinds
def type_matches_template(arg_type, template_type):
    """Check if an argument type matches a template.

    This function is used to test whether the arguments passed to a generic @wp.kernel or @wp.func
    match the template type annotations. The template_type can be generic, but the arg_type must be concrete.
    """
    # work on canonical Warp types from here on
    arg_type = type_to_warp(arg_type)
    template_type = type_to_warp(template_type)

    # a generic argument can never satisfy a template
    if type_is_generic(arg_type):
        return False

    # a concrete template demands an exact match
    if not type_is_generic(template_type):
        return types_equal(arg_type, template_type)

    # from here on the template is generic
    if template_type == Any:
        return True

    if is_array(template_type):
        # the argument must be the same kind of array, with matching dtype and dimensionality
        arrays_agree = (
            type(arg_type) is type(template_type)
            and type_matches_template(arg_type.dtype, template_type.dtype)
            and arg_type.ndim == template_type.ndim
        )
        if not arrays_agree:
            return False
        return True

    if template_type == Float:
        return arg_type in float_types

    if template_type == Int:
        return arg_type in int_types

    if template_type == Scalar:
        return arg_type in scalar_types

    if hasattr(template_type, "_wp_scalar_type_"):
        # vector/matrix template: the argument must be a vector/matrix too
        if not hasattr(arg_type, "_wp_scalar_type_"):
            return False

        if not type_matches_template(arg_type._wp_scalar_type_, template_type._wp_scalar_type_):
            return False

        template_shape = template_type._shape_
        arg_shape = arg_type._shape_
        if len(arg_shape) != len(template_shape):
            return False

        # a template extent of 0 is generic; every other extent must match exactly
        for template_dim, arg_dim in zip(template_shape, arg_shape):
            if template_dim != 0 and arg_dim != template_dim:
                return False

    return True
def infer_argument_types(args, template_types, arg_names=None):
    """Resolve argument types with the given list of template types.

    Args:
        args: Argument values whose types should be inferred.
        template_types: Template types, one per argument; consulted when an argument is ``None``.
        arg_names: Optional argument names used in error messages (the index is used otherwise).

    Returns:
        list: One inferred type per argument, in order.

    Raises:
        RuntimeError: If ``args`` and ``template_types`` differ in length.
        TypeError: If the type of an argument cannot be inferred.
    """
    if len(args) != len(template_types):
        raise RuntimeError("Number of arguments must match number of template types.")

    inferred_types = []

    for index, value in enumerate(args):
        value_type = type(value)
        label = arg_names[index] if arg_names else str(index)

        if value_type in warp.types.array_types:
            # arrays are described by a type instance carrying dtype and ndim
            inferred = value_type(dtype=value.dtype, ndim=value.ndim)
        elif value_type in warp.types.scalar_types:
            inferred = value_type
        elif value_type in [int, float]:
            # canonicalize type
            inferred = warp.types.type_to_warp(value_type)
        elif hasattr(value_type, "_wp_scalar_type_"):
            # vector/matrix type
            inferred = value_type
        elif issubclass(value_type, warp.codegen.StructInstance):
            # a struct
            inferred = value._cls
        elif value is None:
            # allow passing None for arrays
            template = template_types[index]
            if not warp.types.is_array(template):
                raise TypeError(f"Unable to infer the type of argument '{label}', got None")
            inferred = type(template)(dtype=template.dtype, ndim=template.ndim)
        else:
            # TODO: attempt to figure out if it's a vector/matrix type given as a numpy array, list, etc.
            raise TypeError(f"Unable to infer the type of argument '{label}', got {value_type}")

        inferred_types.append(inferred)

    return inferred_types
# Short type codes for "simple" types, i.e. types without a `_wp_scalar_type_`
# attribute; get_type_code() looks such types up in this table.
# Python's int and float share the codes of int32 ("i4") and float32 ("f4").
simple_type_codes = {
int: "i4",
float: "f4",
builtins.bool: "b",
# NOTE(review): presumably `bool` here is a module-level Warp bool type that
# shadows the builtin, hence the separate `builtins.bool` entry above -- confirm.
bool: "b",
str: "str", # accepted by print()
# sized signed integers
int8: "i1",
int16: "i2",
int32: "i4",
int64: "i8",
# sized unsigned integers
uint8: "u1",
uint16: "u2",
uint32: "u4",
uint64: "u8",
# floating point
float16: "f2",
float32: "f4",
float64: "f8",
# helper and query types
shape_t: "sh",
range_t: "rg",
launch_bounds_t: "lb",
hash_grid_query_t: "hgq",
mesh_query_aabb_t: "mqa",
mesh_query_point_t: "mqp",
mesh_query_ray_t: "mqr",
bvh_query_t: "bvhq",
}
def get_type_code(arg_type):
    """Return the short string code that identifies ``arg_type`` in a signature.

    Codes produced here:
        * ``"?"`` for ``Any``; ``"s?"``, ``"f?"``, ``"i?"`` for ``Scalar``, ``Float``, ``Int``
        * the ``simple_type_codes`` entry for simple types
        * ``q<dtype>`` / ``t<dtype>`` for quaternion / transform types
        * ``v<dim><dtype>`` / ``m<rows><cols><dtype>`` for vectors / matrices,
          where a dimension of 0 is written as ``?``
        * ``a``, ``ia``, ``fa``, ``ifa`` followed by ndim and the dtype code for array types
        * the fully qualified class name for structs
        * ``"c"`` for callables

    Raises:
        TypeError: If the type is not recognized or a vector/matrix type has
            an unsupported number of dimensions.
    """
    # special case for generics
    # note: since Python 3.11 Any is a type, so we check for it first
    if arg_type == Any:
        return "?"

    if isinstance(arg_type, type):
        if not hasattr(arg_type, "_wp_scalar_type_"):
            # simple type: plain table lookup
            simple_code = simple_type_codes.get(arg_type)
            if simple_code is None:
                raise TypeError(f"Unrecognized type '{arg_type}'")
            return simple_code

        # vector/matrix type
        scalar_code = get_type_code(arg_type._wp_scalar_type_)

        # "special" vector/matrix subtypes get a dedicated prefix
        if hasattr(arg_type, "_wp_generic_type_str_"):
            generic_name = arg_type._wp_generic_type_str_
            if generic_name == "quat_t":
                return f"q{scalar_code}"
            if generic_name == "transform_t":
                return f"t{scalar_code}"

        # generic vector/matrix
        shape = arg_type._shape_
        rank = len(shape)
        if rank == 1:
            prefix = "v"
        elif rank == 2:
            prefix = "m"
        else:
            raise TypeError("Invalid vector/matrix dimensionality")

        # a zero extent marks a generic dimension and is written as "?"
        dim_codes = "".join("?" if extent == 0 else str(extent) for extent in shape)
        return f"{prefix}{dim_codes}{scalar_code}"

    # array annotations: checked in this order, each with its own prefix
    array_prefixes = (
        (array, "a"),
        (indexedarray, "ia"),
        (fabricarray, "fa"),
        (indexedfabricarray, "ifa"),
    )
    for array_kind, prefix in array_prefixes:
        if isinstance(arg_type, array_kind):
            return f"{prefix}{arg_type.ndim}{get_type_code(arg_type.dtype)}"

    if isinstance(arg_type, warp.codegen.Struct):
        return warp.codegen.make_full_qualified_name(arg_type.cls)

    # generic scalar / float / int placeholders
    for generic_kind, generic_code in ((Scalar, "s?"), (Float, "f?"), (Int, "i?")):
        if arg_type == generic_kind:
            return generic_code

    if isinstance(arg_type, Callable):
        # TODO: elaborate on Callable type?
        return "c"

    raise TypeError(f"Unrecognized type '{arg_type}'")
def get_signature(arg_types, func_name=None, arg_names=None):
    """Build a signature string by joining the type codes of ``arg_types`` with ``"_"``.

    Args:
        arg_types: Iterable of argument types, each passed to ``get_type_code()``.
        func_name: Optional function name, only used in error messages.
        arg_names: Optional argument names, only used in error messages; when
            omitted, arguments are identified by their 1-based position.

    Returns:
        str: The joined type codes (an empty string for no arguments).

    Raises:
        RuntimeError: If a type code cannot be determined; the original
            exception is attached as ``__cause__``.
    """
    type_codes = []
    for i, arg_type in enumerate(arg_types):
        try:
            type_codes.append(get_type_code(arg_type))
        except Exception as e:
            arg_str = f"'{arg_names[i]}'" if arg_names is not None else str(i + 1)
            func_str = f" of function {func_name}" if func_name is not None else ""
            # chain explicitly so the underlying failure stays attached as the cause
            raise RuntimeError(
                f"Failed to determine type code for argument {arg_str}{func_str}: {e}"
            ) from e

    return "_".join(type_codes)
def is_generic_signature(sig):
    """Return whether ``sig`` contains the ``"?"`` marker used in generic type codes."""
    has_generic_marker = "?" in sig
    return has_generic_marker