/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */
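#include <cstdlib>
#include <iostream>

// NOTE: SHFL_MASK, the lane mask passed to __shfl_xor_sync below, is expected
// to be defined by the file that includes this one (typically 0xffffffff,
// i.e. all 32 lanes of a warp participate).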
template <typename U, typename V>
constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) {
  return (a + b - 1) / b;
}
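// Illustrative compile-time checks (not in the original file): divUp is
// ceiling division, so a remainder rounds the quotient up.
static_assert(divUp(10, 4) == 3, "divUp rounds up when b does not divide a");
static_assert(divUp(8, 4) == 2, "divUp is exact when b divides a");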
template <int FS, int SB, int padding_l, typename scalar_t>
__inline__ __device__ void zeroSharedMem(scalar_t* data) {
  /*
  Given an array of length FS + SB, zero out the first padding_l and last
  (FS - padding_l) values in the array
  */
  int tid = threadIdx.x;

  if (FS < SB) {
    // zero all if we have enough threads in a block to do all of them
    if (tid < padding_l || tid > SB - FS + padding_l - 1) {
      data[tid] = scalar_t(0.0);
    }
  } else {
    // otherwise zero out one block at a time
    const int numIterations = divUp<int, int>(FS, SB);
    for (int i = 0; i < numIterations; i++) {
      int offset = i * SB;
      if (tid + offset < padding_l) {
        data[tid + offset] = scalar_t(0.0);
      } else if (tid + offset < FS) {
        data[SB + tid + offset] = scalar_t(0.0);
      }
    }
  }
}
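// Illustration (not from the original file) of the shared-memory layout
// assumed by zeroSharedMem above and load_input_to_shared further below: a
// buffer of FS + SB elements holding one block of the sequence plus a halo
// on each side.
//
//   index [0 .. padding_l)             left halo (padding_l elements)
//   index [padding_l .. padding_l+SB)  current block of SB elements
//   index [padding_l+SB .. FS+SB)      right halo (FS - padding_l elements)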
template <typename scalar_t>
__inline__ __device__ scalar_t warpReduce(scalar_t data) {
  /*
  Reduce a value across a warp. After processing, every lane in the warp
  will contain the sum of the values originally held by all lanes of that
  warp.
  data - this thread's value to reduce
  */
  data += __shfl_xor_sync(SHFL_MASK, data, 16);
  data += __shfl_xor_sync(SHFL_MASK, data, 8);
  data += __shfl_xor_sync(SHFL_MASK, data, 4);
  data += __shfl_xor_sync(SHFL_MASK, data, 2);
  data += __shfl_xor_sync(SHFL_MASK, data, 1);
  return data;
}
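// Hypothetical usage sketch (not in the original file): every lane feeds its
// value into warpReduce; afterwards each lane holds the warp-wide sum, so
// lane 0 can write one result per warp. Assumes blockDim.x is a multiple of
// 32 and out has room for one element per warp.
template <typename scalar_t>
__global__ void warpReduceExample(const scalar_t* in, scalar_t* out) {
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  scalar_t v = warpReduce<scalar_t>(in[idx]);
  if (threadIdx.x % 32 == 0) {
    out[idx / 32] = v;
  }
}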
template <typename scalar_t>
__inline__ __device__ scalar_t blockReduce(scalar_t data) {
  /*
  Reduce a value across an entire block. After processing, threads in the
  first warp (including thread 0) return the block-wide sum; all other
  threads return zero. Assumes blockDim.x is a multiple of 32.
  data - this thread's value to reduce
  */
  static __shared__ scalar_t warpSum[32];
  const int tid = threadIdx.x;
  int wid = tid / 32;
  int lane = tid % 32;
  __syncthreads();

  // reduce each warp then write its partial sum to shared memory
  scalar_t sum = warpReduce(data);
  if (lane == 0) {
    warpSum[wid] = sum;
  }
  __syncthreads();

  scalar_t v;
  // perform the final sum of the partial warp sums in the first warp
  if (tid < blockDim.x / 32) {
    v = warpSum[lane];
  } else {
    v = scalar_t(0.0);
  }
  if (wid == 0) {
    v = warpReduce(v);
  }
  __syncthreads();
  return v;
}
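// Hypothetical usage sketch (not in the original file): each block sums its
// blockDim.x inputs and thread 0 writes one result per block. Assumes
// blockDim.x is a multiple of 32 and in has gridDim.x * blockDim.x elements.
template <typename scalar_t>
__global__ void blockReduceExample(const scalar_t* in, scalar_t* out) {
  scalar_t sum =
      blockReduce<scalar_t>(in[blockIdx.x * blockDim.x + threadIdx.x]);
  if (threadIdx.x == 0) {
    out[blockIdx.x] = sum;
  }
}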
void checkCudaStatus(cudaError_t status, int lineNumber = -1) {
  if (status != cudaSuccess) {
    std::cout << cudaGetErrorString(status) << " at line " << lineNumber
              << std::endl;
    std::cout << "Exiting" << std::endl;
    exit(1);
  }
}
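// Typical call site (illustrative): wrap a CUDA runtime call and pass the
// current source line so failures are easy to locate, e.g.
//   checkCudaStatus(cudaMalloc(&devPtr, numBytes), __LINE__);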
template <int FS, int SB, int padding_l, typename scalar_t>
__device__ void load_input_to_shared(
    const scalar_t* input, // global memory
    int inputOffset,
    int sequenceLength,
    int iteration,
    int numIterations,
    bool no_prev,
    scalar_t* output /* shared memory */) {
  /*
  Load one block (SB elements) of input into shared memory, together with a
  left overhang of padding_l elements and a right overhang of FS - padding_l
  elements. If the previous block is still resident in shared memory, the
  left overhang is shifted over from it instead of being re-read from global
  memory.
  input - pointer to the start of the channel sequence
  inputOffset - offset into the sequence at which to start loading
  sequenceLength - total length of the sequence
  iteration - which block of the sequence we are loading
  numIterations - total number of blocks to load
  no_prev - true if the previous block is not in shared memory, so the left
            overhang must be read from global memory rather than shifted
  output - shared memory buffer (of size FS + SB) to write the input to
  */
  const int tid = threadIdx.x;

  // Load the left "overhang" of input
  if (iteration > 0) {
    if (padding_l < SB) {
      // load all at once
      if (tid < padding_l) {
        output[tid] =
            (no_prev) ? input[inputOffset - padding_l + tid] : output[tid + SB];
      }
    } else {
      // load in chunks of size SB
      int numIterations = divUp<int, int>(padding_l, SB);
      for (int i = 0; i < numIterations; i++) {
        int offset = i * SB;
        if ((tid + offset) < padding_l) {
          output[tid + offset] = (no_prev)
              ? input[inputOffset - padding_l + tid + offset]
              : output[tid + offset + SB];
        }
      }
    }
  }

  // Load the right "overhang" of input
  if (iteration < (numIterations - 1)) {
    const int elementsLeft = sequenceLength - (iteration + 1) * SB;

    if ((FS - padding_l) < SB) {
      // load all at once
      if (tid < (FS - padding_l)) {
        output[padding_l + SB + tid] = (tid < elementsLeft)
            ? input[inputOffset + SB + tid]
            : scalar_t(0.0);
      }
    } else {
      // load in chunks of size SB
      int numIterations = divUp<int, int>(FS - padding_l, SB);
      for (int i = 0; i < numIterations; i++) {
        int offset = i * SB;
        if ((tid + offset) < (FS - padding_l)) {
          output[padding_l + SB + tid + offset] =
              ((tid + offset) < elementsLeft)
              ? input[inputOffset + SB + tid + offset]
              : scalar_t(0.0);
        }
      }
    }
  }

  // On the last iteration there is no next block, so clear out the right
  // "overhang" instead of loading it
  if (iteration == (numIterations - 1)) {
    if ((FS - padding_l) < SB) {
      // clear out all at once
      if (tid < (FS - padding_l)) {
        output[padding_l + SB + tid] = scalar_t(0.0);
      }
    } else {
      // clear in chunks of size SB
      int numIterations = divUp<int, int>(FS - padding_l, SB);
      for (int i = 0; i < numIterations; i++) {
        int offset = i * SB;
        if ((tid + offset) < (FS - padding_l)) {
          output[padding_l + SB + tid + offset] = scalar_t(0.0);
        }
      }
    }
  }

  // Finally, load the main block of SB elements
  output[tid + padding_l] = ((inputOffset + tid) < sequenceLength)
      ? input[inputOffset + tid]
      : scalar_t(0.0);
}
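// Hypothetical end-to-end sketch (not one of the original fairseq kernels):
// stream one channel of a long sequence through shared memory in SB-sized
// blocks, using zeroSharedMem and load_input_to_shared to maintain the padded
// FS + SB buffer, and compute a naive box filter of width FS as a stand-in
// for the real convolution. no_prev is passed as true so the left overhang is
// always re-read from global memory; assumes padding_l <= SB so those reads
// stay in bounds for iterations after the first.
template <int FS, int SB, int padding_l, typename scalar_t>
__global__ void streamSequenceExample(
    const scalar_t* input,
    scalar_t* output,
    int sequenceLength) {
  __shared__ scalar_t local_input[SB + FS];
  const int tid = threadIdx.x;
  const int numIterations = divUp<int, int>(sequenceLength, SB);

  // Zero the halo regions once; the main body is overwritten every iteration.
  zeroSharedMem<FS, SB, padding_l, scalar_t>(local_input);

  for (int i = 0; i < numIterations; i++) {
    __syncthreads();
    const int inputOffset = i * SB;
    load_input_to_shared<FS, SB, padding_l, scalar_t>(
        input,
        inputOffset,
        sequenceLength,
        i,
        numIterations,
        /* no_prev = */ true,
        local_input);
    __syncthreads();

    // Box filter over the FS-wide window around each position; out-of-range
    // positions contribute zero thanks to the padding handling above.
    if (inputOffset + tid < sequenceLength) {
      scalar_t acc = scalar_t(0.0);
      for (int k = 0; k < FS; k++) {
        acc += local_input[tid + k];
      }
      output[inputOffset + tid] = acc;
    }
  }
}
// Launched with one block of SB threads per channel, e.g.
//   streamSequenceExample<FS, SB, padding_l, float><<<1, SB>>>(in, out, len);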