|
#ifndef OPENPOSE_PRIVATE_GPU_CUDA_HU |
|
#define OPENPOSE_PRIVATE_GPU_CUDA_HU |
|
|
|
|
|
|
|
#include <cuda.h> |
|
#include <cuda_runtime.h> |
|
|
|
namespace op |
|
{ |
|
|
|
|
|
|
|
|
|
|
|
template<typename T> |
|
inline __device__ char positiveCharRoundCuda(const T a) |
|
{ |
|
return char(a+0.5f); |
|
} |
|
|
|
template<typename T> |
|
inline __device__ signed char positiveSCharRoundCuda(const T a) |
|
{ |
|
return (signed char)(a+0.5f); |
|
} |
|
|
|
template<typename T> |
|
inline __device__ int positiveIntRoundCuda(const T a) |
|
{ |
|
return int(a+0.5f); |
|
} |
|
|
|
template<typename T> |
|
inline __device__ long positiveLongRoundCuda(const T a) |
|
{ |
|
return long(a+0.5f); |
|
} |
|
|
|
template<typename T> |
|
inline __device__ long long positiveLongLongRoundCuda(const T a) |
|
{ |
|
return (long long)(a+0.5f); |
|
} |
|
|
|
|
|
template<typename T> |
|
inline __device__ unsigned char uCharRoundCuda(const T a) |
|
{ |
|
return (unsigned char)(a+0.5f); |
|
} |
|
|
|
template<typename T> |
|
inline __device__ unsigned int uIntRoundCuda(const T a) |
|
{ |
|
return (unsigned int)(a+0.5f); |
|
} |
|
|
|
template<typename T> |
|
inline __device__ unsigned long ulongRoundCuda(const T a) |
|
{ |
|
return (unsigned long)(a+0.5f); |
|
} |
|
|
|
template<typename T> |
|
inline __device__ unsigned long long uLongLongRoundCuda(const T a) |
|
{ |
|
return (unsigned long long)(a+0.5f); |
|
} |
|
|
|
|
|
template<class T> |
|
inline __device__ T fastMaxCuda(const T a, const T b) |
|
{ |
|
return (a > b ? a : b); |
|
} |
|
|
|
template<class T> |
|
inline __device__ T fastMinCuda(const T a, const T b) |
|
{ |
|
return (a < b ? a : b); |
|
} |
|
|
|
template<class T> |
|
inline __device__ T fastTruncateCuda(const T value, const T min = 0, const T max = 1) |
|
{ |
|
return fastMinCuda(max, fastMaxCuda(min, value)); |
|
} |
|
|
|
|
|
template <typename T> |
|
inline __device__ void cubicSequentialData( |
|
int* xIntArray, int* yIntArray, T& dx, T& dy, const T xSource, const T ySource, const int widthSource, |
|
const int heightSource) |
|
{ |
|
xIntArray[1] = fastTruncateCuda(int(floor(xSource)), 0, widthSource - 1); |
|
xIntArray[0] = fastMaxCuda(0, xIntArray[1] - 1); |
|
xIntArray[2] = fastMinCuda(widthSource - 1, xIntArray[1] + 1); |
|
xIntArray[3] = fastMinCuda(widthSource - 1, xIntArray[2] + 1); |
|
dx = xSource - xIntArray[1]; |
|
|
|
yIntArray[1] = fastTruncateCuda(int(floor(ySource)), 0, heightSource - 1); |
|
yIntArray[0] = fastMaxCuda(0, yIntArray[1] - 1); |
|
yIntArray[2] = fastMinCuda(heightSource - 1, yIntArray[1] + 1); |
|
yIntArray[3] = fastMinCuda(heightSource - 1, yIntArray[2] + 1); |
|
dy = ySource - yIntArray[1]; |
|
} |
|
|
|
template <typename T> |
|
inline __device__ T cubicInterpolate(const T v0, const T v1, const T v2, const T v3, const T dx) |
|
{ |
|
|
|
|
|
|
|
|
|
|
|
return (-0.5f * v0 + 1.5f * v1 - 1.5f * v2 + 0.5f * v3) * dx * dx * dx |
|
+ (v0 - 2.5f * v1 + 2.f * v2 - 0.5f * v3) * dx * dx |
|
- 0.5f * (v0 - v2) * dx |
|
+ v1; |
|
|
|
} |
|
|
|
template <typename T> |
|
inline __device__ T bicubicInterpolate( |
|
const T* const sourcePtr, const T xSource, const T ySource, const int widthSource, const int heightSource, |
|
const int widthSourcePtr) |
|
{ |
|
int xIntArray[4]; |
|
int yIntArray[4]; |
|
T dx; |
|
T dy; |
|
cubicSequentialData(xIntArray, yIntArray, dx, dy, xSource, ySource, widthSource, heightSource); |
|
|
|
T temp[4]; |
|
for (unsigned char i = 0; i < 4; i++) |
|
{ |
|
const auto offset = yIntArray[i]*widthSourcePtr; |
|
temp[i] = cubicInterpolate( |
|
sourcePtr[offset + xIntArray[0]], sourcePtr[offset + xIntArray[1]], sourcePtr[offset + xIntArray[2]], |
|
sourcePtr[offset + xIntArray[3]], dx); |
|
} |
|
return cubicInterpolate(temp[0], temp[1], temp[2], temp[3], dy); |
|
} |
|
|
|
template <typename T> |
|
inline __device__ T bicubicInterpolate( |
|
const unsigned char* const sourcePtr, const T xSource, const T ySource, const int widthSource, |
|
const int heightSource, const int widthSourcePtr) |
|
{ |
|
int xIntArray[4]; |
|
int yIntArray[4]; |
|
T dx; |
|
T dy; |
|
cubicSequentialData(xIntArray, yIntArray, dx, dy, xSource, ySource, widthSource, heightSource); |
|
|
|
T temp[4]; |
|
for (unsigned char i = 0; i < 4; i++) |
|
{ |
|
const auto offset = yIntArray[i]*widthSourcePtr; |
|
temp[i] = cubicInterpolate( |
|
T(sourcePtr[offset + xIntArray[0]]), T(sourcePtr[offset + xIntArray[1]]), |
|
T(sourcePtr[offset + xIntArray[2]]), T(sourcePtr[offset + xIntArray[3]]), dx); |
|
} |
|
return cubicInterpolate(temp[0], temp[1], temp[2], temp[3], dy); |
|
} |
|
|
|
template <typename T> |
|
inline __device__ T bicubicInterpolate8Times( |
|
const T* const sourcePtr, const T xSource, const T ySource, const int widthSource, const int heightSource, |
|
const int threadIdxX, const int threadIdxY) |
|
{ |
|
|
|
const T dx = xSource - fastTruncateCuda(int(floor(xSource)), 0, widthSource - 1); |
|
const T dy = ySource - fastTruncateCuda(int(floor(ySource)), 0, heightSource - 1); |
|
|
|
T temp[4]; |
|
for (unsigned char i = 0; i < 4; i++) |
|
{ |
|
const auto offset = 5 * (i + (threadIdxY > 3 ? 1 : 0)) + (threadIdxX > 3 ? 1 : 0); |
|
temp[i] = cubicInterpolate( |
|
sourcePtr[offset], sourcePtr[offset+1], sourcePtr[offset+2], |
|
sourcePtr[offset+3], dx); |
|
} |
|
return cubicInterpolate(temp[0], temp[1], temp[2], temp[3], dy); |
|
} |
|
|
|
template <typename T> |
|
inline __device__ T addWeighted(const T value1, const T value2, const T alphaValue2) |
|
{ |
|
return (1.f - alphaValue2) * value1 + alphaValue2 * value2; |
|
} |
|
|
|
template <typename T> |
|
inline __device__ void addColorWeighted( |
|
T& colorR, T& colorG, T& colorB, const T* const colorToAdd, const T alphaColorToAdd) |
|
{ |
|
colorR = addWeighted(colorR, colorToAdd[0], alphaColorToAdd); |
|
colorG = addWeighted(colorG, colorToAdd[1], alphaColorToAdd); |
|
colorB = addWeighted(colorB, colorToAdd[2], alphaColorToAdd); |
|
} |
|
} |
|
|
|
#endif |
|
|