// Copyright (C) 2017 Davis E. King (davis@dlib.net) | |
// License: Boost Software License See LICENSE.txt for the full license. | |
namespace dlib | |
{ | |
namespace cuda | |
{ | |
// ------------------------------------------------------------------------------------ | |
class cuda_data_void_ptr; | |
class weak_cuda_data_void_ptr | |
{ | |
/*! | |
WHAT THIS OBJECT REPRESENTS | |
This is just like a std::weak_ptr version of cuda_data_void_ptr. It allows you | |
to hold a non-owning reference to a cuda_data_void_ptr. | |
!*/ | |
public: | |
weak_cuda_data_void_ptr() = default; | |
weak_cuda_data_void_ptr(const cuda_data_void_ptr& ptr); | |
void reset() { pdata.reset(); num = 0; } | |
cuda_data_void_ptr lock() const; | |
/*! | |
ensures | |
- if (the memory block referenced by this object hasn't been deleted) then | |
- returns a cuda_data_void_ptr referencing that memory block | |
- else | |
- returns a default initialized cuda_data_void_ptr (i.e. an empty one). | |
!*/ | |
private: | |
size_t num = 0; | |
std::weak_ptr<void> pdata; | |
}; | |
// ---------------------------------------------------------------------------------------- | |
class cuda_data_void_ptr | |
{ | |
/*! | |
WHAT THIS OBJECT REPRESENTS | |
This is a block of memory on a CUDA device. | |
!*/ | |
public: | |
cuda_data_void_ptr() = default; | |
cuda_data_void_ptr(size_t n); | |
/*! | |
ensures | |
- This object will allocate a device memory buffer of n bytes. | |
- #size() == n | |
!*/ | |
void* data() { return pdata.get(); } | |
const void* data() const { return pdata.get(); } | |
operator void*() { return pdata.get(); } | |
operator const void*() const { return pdata.get(); } | |
void reset() { pdata.reset(); } | |
size_t size() const { return num; } | |
/*! | |
ensures | |
- returns the length of this buffer, in bytes. | |
!*/ | |
cuda_data_void_ptr operator+ (size_t offset) const | |
/*! | |
requires | |
- offset < size() | |
ensures | |
- returns a pointer that is offset by the given amount. | |
!*/ | |
{ | |
DLIB_CASSERT(offset < num); | |
cuda_data_void_ptr temp; | |
temp.num = num-offset; | |
temp.pdata = std::shared_ptr<void>(pdata, ((char*)pdata.get())+offset); | |
return temp; | |
} | |
void shrink(size_t new_size) | |
/*! | |
requires | |
- new_size <= num | |
ensures | |
- #size() == new_size | |
- Doesn't actually deallocate anything, just changes the size() metadata to a | |
smaller number and only for this instance of the pointer. | |
!*/ | |
{ | |
DLIB_CASSERT(new_size <= num); | |
num = new_size; | |
} | |
private: | |
friend class weak_cuda_data_void_ptr; | |
size_t num = 0; | |
std::shared_ptr<void> pdata; | |
}; | |
inline cuda_data_void_ptr operator+(size_t offset, const cuda_data_void_ptr& rhs)
/*!
    requires
        - offset < rhs.size()
    ensures
        - returns rhs + offset.  This overload just lets the offset be written on
          the left hand side.
!*/
{
    return rhs.operator+(offset);
}
// ------------------------------------------------------------------------------------ | |
void memcpy( | |
void* dest, | |
const cuda_data_void_ptr& src | |
); | |
/*! | |
requires | |
- dest == a pointer to at least src.size() bytes on the host machine. | |
ensures | |
- copies the GPU data from src into dest. | |
- This routine is equivalent to performing: memcpy(dest,src,src.size()) | |
!*/ | |
void memcpy( | |
void* dest, | |
const cuda_data_void_ptr& src, | |
const size_t num | |
); | |
/*! | |
requires | |
- dest == a pointer to at least num bytes on the host machine. | |
- num <= src.size() | |
ensures | |
- copies the GPU data from src into dest. Copies only the first num bytes | |
of src to dest. | |
!*/ | |
// ------------------------------------------------------------------------------------ | |
void memcpy( | |
cuda_data_void_ptr dest, | |
const void* src | |
); | |
/*!
    requires
        - src == a pointer to at least dest.size() bytes on the host machine.
    ensures
        - copies the host data from src to the GPU memory buffer dest.
        - This routine is equivalent to performing: memcpy(dest,src,dest.size())
!*/
void memcpy( | |
cuda_data_void_ptr dest, | |
const void* src, | |
const size_t num | |
); | |
/*!
    requires
        - src == a pointer to at least num bytes on the host machine.
        - num <= dest.size()
    ensures
        - copies the host data from src to the GPU memory buffer dest.  Copies only
          the first num bytes of src to dest.
!*/
// ------------------------------------------------------------------------------------ | |
// ------------------------------------------------------------------------------------ | |
// ------------------------------------------------------------------------------------ | |
template <typename T> | |
class cuda_data_ptr | |
{ | |
/*! | |
WHAT THIS OBJECT REPRESENTS | |
This is a block of memory on a CUDA device. It is just a type safe | |
version of cuda_data_void_ptr. | |
!*/ | |
public: | |
static_assert(std::is_standard_layout<T>::value, "You can only create basic standard layout types on the GPU"); | |
cuda_data_ptr() = default; | |
cuda_data_ptr(size_t n) : num(n) | |
/*! | |
ensures | |
- This object will allocate a device memory buffer of n T objects. | |
- #size() == n | |
!*/ | |
{ | |
if (n == 0) | |
return; | |
pdata = cuda_data_void_ptr(n*sizeof(T)); | |
} | |
cuda_data_ptr( | |
const cuda_data_ptr<typename std::remove_const<T>::type> &other | |
) : num(other.num), pdata(other.pdata) {} | |
/*! | |
ensures | |
- *this is a copy of other. This version of the copy constructor allows | |
assigning non-const pointers to const ones. For instance, converting from | |
cuda_data_ptr<float> to cuda_data_ptr<const float>. | |
!*/ | |
T* data() { return (T*)pdata.data(); } | |
const T* data() const { return (T*)pdata.data(); } | |
operator T*() { return (T*)pdata.data(); } | |
operator const T*() const { return (T*)pdata.data(); } | |
void reset() { pdata.reset(); } | |
size_t size() const { return num; } | |
/*! | |
ensures | |
- returns the number of T instances pointed to by *this. | |
!*/ | |
operator cuda_data_void_ptr() const | |
/*! | |
ensures | |
- returns *this as a cuda_data_void_ptr. Importantly, the returned size() will | |
reflect the number of bytes referenced by *this. To be clear, let P be the | |
returned pointer. Then: | |
- P.get() == get() | |
- P.size() == size() * sizeof(T) | |
!*/ | |
{ | |
cuda_data_void_ptr temp = pdata; | |
temp.shrink(size() * sizeof(T)); | |
return temp; | |
} | |
private: | |
template <typename U> | |
friend cuda_data_ptr<U> static_pointer_cast(const cuda_data_void_ptr &ptr); | |
template <typename U> | |
friend cuda_data_ptr<U> static_pointer_cast(const cuda_data_void_ptr &ptr, size_t num); | |
template <typename U> | |
friend class cuda_data_ptr; | |
size_t num = 0; | |
cuda_data_void_ptr pdata; | |
}; | |
template <typename T> | |
cuda_data_ptr<T> static_pointer_cast(const cuda_data_void_ptr &ptr) | |
{ | |
DLIB_CASSERT(ptr.size() % sizeof(T) == 0, | |
"Size of memory buffer in ptr doesn't match sizeof(T). " | |
<< "\nptr.size(): "<< ptr.size() | |
<< "\nsizeof(T): "<< sizeof(T)); | |
cuda_data_ptr<T> result; | |
result.pdata = ptr; | |
result.num = ptr.size() / sizeof(T); | |
return result; | |
} | |
template <typename T> | |
cuda_data_ptr<T> static_pointer_cast(const cuda_data_void_ptr &ptr, size_t num) | |
{ | |
DLIB_CASSERT(num*sizeof(T) <= ptr.size(), | |
"Size of memory buffer in ptr isn't big enough to represent this many T objects. " | |
<< "\nnum: "<< num | |
<< "\nnum*sizeof(T): "<< num*sizeof(T) | |
<< "\nsizeof(T): "<< sizeof(T) | |
<< "\nptr.size(): "<< ptr.size()); | |
cuda_data_ptr<T> result; | |
result.pdata = ptr; | |
result.num = num; | |
return result; | |
} | |
template <typename T> | |
void memcpy(std::vector<T>& dest, const cuda_data_ptr<T>& src) | |
{ | |
dest.resize(src.size()); | |
if (src.size() != 0) | |
memcpy(dest.data(), static_cast<cuda_data_void_ptr>(src)); | |
} | |
template <typename T> | |
void memcpy(cuda_data_ptr<T>& dest, const std::vector<T>& src) | |
{ | |
if (src.size() != dest.size()) | |
dest = cuda_data_ptr<T>(src.size()); | |
if (dest.size() != 0) | |
memcpy(static_cast<cuda_data_void_ptr>(dest), src.data()); | |
} | |
template <typename T> | |
void memcpy(cuda_data_ptr<T>& dest, const T* src) | |
{ | |
memcpy(static_cast<cuda_data_void_ptr>(dest), src); | |
} | |
template <typename T> | |
void memcpy(cuda_data_ptr<T>& dest, const T* src, size_t num) | |
{ | |
DLIB_CASSERT(num <= dest.size()); | |
memcpy(static_cast<cuda_data_void_ptr>(dest), src, num*sizeof(T)); | |
} | |
template <typename T> | |
void memcpy(T* dest, const cuda_data_ptr<T>& src) | |
{ | |
memcpy(dest, static_cast<cuda_data_void_ptr>(src)); | |
} | |
template <typename T> | |
void memcpy(T* dest, const cuda_data_ptr<T>& src, size_t num) | |
{ | |
DLIB_CASSERT(num <= src.size()); | |
memcpy(dest, static_cast<cuda_data_void_ptr>(src), num*sizeof(T)); | |
} | |
// ------------------------------------------------------------------------------------ | |
cuda_data_void_ptr device_global_buffer(size_t size); | |
/*! | |
ensures | |
- Returns a pointer to a globally shared CUDA memory buffer on the | |
currently selected CUDA device. The buffer is also thread local. So | |
each host thread will get its own buffer. You can use this global buffer | |
as scratch space for CUDA computations that all take place on the default | |
stream. Using it in this way ensures that there aren't any race conditions | |
involving the use of the buffer. | |
- The returned pointer will point to at least size bytes. It may point to more. | |
- The global buffer is deallocated once all references to it are destructed. | |
However, if device_global_buffer() is called before then with a size <= the last | |
size requested, then the previously returned global buffer pointer is returned. | |
This avoids triggering expensive CUDA reallocations. So if you want to avoid | |
these reallocations then hold a copy of the pointer returned by this function. | |
However, as a general rule, client code should not hold the returned | |
cuda_data_void_ptr for long durations, but instead should call | |
device_global_buffer() whenever the buffer is needed, and overwrite the previously | |
returned pointer with the new pointer. Doing so ensures multiple buffers are not | |
kept around in the event that multiple sized buffers are requested. To explain | |
this, consider this code, assumed to execute at program startup: | |
auto ptr1 = device_global_buffer(1); | |
auto ptr2 = device_global_buffer(2); | |
auto ptr3 = device_global_buffer(3); | |
since the sizes increased at each call 3 separate buffers were allocated. First | |
one of size 1, then of size 2, then of size 3. If we then executed: | |
ptr1 = device_global_buffer(1); | |
ptr2 = device_global_buffer(2); | |
ptr3 = device_global_buffer(3); | |
all three of these pointers would now point to the same buffer, since the smaller | |
requests can be satisfied by returning the size 3 buffer in each case. | |
!*/ | |
// ---------------------------------------------------------------------------------------- | |
} | |
} | |