| /****************************************************************************** |
| * Copyright (c) 2011, Duane Merrill. All rights reserved. |
| * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions are met: |
| * * Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * * Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * * Neither the name of the NVIDIA CORPORATION nor the |
| * names of its contributors may be used to endorse or promote products |
| * derived from this software without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY |
| * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| * |
| ******************************************************************************/ |
| |
| /****************************************************************************** |
| * Simple demonstration of cub::BlockScan |
| * |
| * To compile using the command line: |
| * nvcc -arch=sm_XX example_block_scan.cu -I../.. -lcudart -O3 |
| * |
| ******************************************************************************/ |
| |
| // Ensure printing of CUDA runtime errors to console (define before including cub.h) |
| #define CUB_STDERR |
| |
| #include <stdio.h> |
| #include <iostream> |
| |
| #include <cub/block/block_load.cuh> |
| #include <cub/block/block_store.cuh> |
| #include <cub/block/block_scan.cuh> |
| |
| #include "../../test/test_util.h" |
| |
| using namespace cub; |
| |
| //--------------------------------------------------------------------- |
| // Globals, constants and typedefs |
| //--------------------------------------------------------------------- |
| |
| /// Verbose output |
| bool g_verbose = false; |
| |
| /// Timing iterations |
| int g_timing_iterations = 100; |
| |
| /// Default grid size |
| int g_grid_size = 1; |
| |
| |
| |
| //--------------------------------------------------------------------- |
| // Kernels |
| //--------------------------------------------------------------------- |
| |
| /** |
| * Simple kernel for performing a block-wide exclusive prefix sum over integers |
| */ |
| template < |
| int BLOCK_THREADS, |
| int ITEMS_PER_THREAD, |
| BlockScanAlgorithm ALGORITHM> |
| __global__ void BlockPrefixSumKernel( |
| int *d_in, // Tile of input |
| int *d_out, // Tile of output |
| clock_t *d_elapsed) // Elapsed cycle count of block scan |
| { |
| // Specialize BlockLoad type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement) |
| typedef BlockLoad<int, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadT; |
| |
| // Specialize BlockStore type for our thread block (uses warp-striped loads for coalescing, then transposes in shared memory to a blocked arrangement) |
| typedef BlockStore<int, BLOCK_THREADS, ITEMS_PER_THREAD, BLOCK_STORE_WARP_TRANSPOSE> BlockStoreT; |
| |
| // Specialize BlockScan type for our thread block |
| typedef BlockScan<int, BLOCK_THREADS, ALGORITHM> BlockScanT; |
| |
| // Shared memory |
| __shared__ union TempStorage |
| { |
| typename BlockLoadT::TempStorage load; |
| typename BlockStoreT::TempStorage store; |
| typename BlockScanT::TempStorage scan; |
| } temp_storage; |
| |
| // Per-thread tile data |
| int data[ITEMS_PER_THREAD]; |
| |
| // Load items into a blocked arrangement |
| BlockLoadT(temp_storage.load).Load(d_in, data); |
| |
| // Barrier for smem reuse |
| __syncthreads(); |
| |
| // Start cycle timer |
| clock_t start = clock(); |
| |
| // Compute exclusive prefix sum |
| int aggregate; |
| BlockScanT(temp_storage.scan).ExclusiveSum(data, data, aggregate); |
| |
| // Stop cycle timer |
| clock_t stop = clock(); |
| |
| // Barrier for smem reuse |
| __syncthreads(); |
| |
| // Store items from a blocked arrangement |
| BlockStoreT(temp_storage.store).Store(d_out, data); |
| |
| // Store aggregate and elapsed clocks |
| if (threadIdx.x == 0) |
| { |
| *d_elapsed = (start > stop) ? start - stop : stop - start; |
| d_out[BLOCK_THREADS * ITEMS_PER_THREAD] = aggregate; |
| } |
| } |
| |
|
|
|
|
| //--------------------------------------------------------------------- |
| // Host utilities |
| //--------------------------------------------------------------------- |
|
|
| /** |
| * Initialize exclusive prefix sum problem (and solution). |
| * Returns the aggregate |
| */ |
| int Initialize( |
| int *h_in, |
| int *h_reference, |
| int num_items) |
| { |
| int inclusive = 0; |
| |
| for (int i = 0; i < num_items; ++i) |
| { |
| h_in[i] = i % 17; |
| |
| h_reference[i] = inclusive; |
| inclusive += h_in[i]; |
| } |
| |
| return inclusive; |
| } |
| |
|
|
| /** |
| * Test thread block scan |
| */ |
| template < |
| int BLOCK_THREADS, |
| int ITEMS_PER_THREAD, |
| BlockScanAlgorithm ALGORITHM> |
| void Test() |
| { |
| const int TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD; |
|
|
| // Allocate host arrays |
| int *h_in = new int[TILE_SIZE]; |
| int *h_reference = new int[TILE_SIZE]; |
| int *h_gpu = new int[TILE_SIZE + 1]; |
| |
| // Initialize problem and reference output on host |
| int h_aggregate = Initialize(h_in, h_reference, TILE_SIZE); |
| |
| // Initialize device arrays |
| int *d_in = NULL; |
| int *d_out = NULL; |
| clock_t *d_elapsed = NULL; |
| cudaMalloc((void**)&d_in, sizeof(int) * TILE_SIZE); |
| cudaMalloc((void**)&d_out, sizeof(int) * (TILE_SIZE + 1)); |
| cudaMalloc((void**)&d_elapsed, sizeof(clock_t)); |
| |
| // Display input problem data |
| if (g_verbose) |
| { |
| printf("Input data: "); |
| for (int i = 0; i < TILE_SIZE; i++) |
| printf("%d, ", h_in[i]); |
| printf("\n\n"); |
| } |
| |
| // Kernel props |
| int max_sm_occupancy; |
| CubDebugExit(MaxSmOccupancy(max_sm_occupancy, BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM>, BLOCK_THREADS)); |
| |
| // Copy problem to device |
| cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice); |
| |
| printf("BlockScan algorithm %s on %d items (%d timing iterations, %d blocks, %d threads, %d items per thread, %d SM occupancy):\n", |
| (ALGORITHM == BLOCK_SCAN_RAKING) ? "BLOCK_SCAN_RAKING" : (ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE) ? "BLOCK_SCAN_RAKING_MEMOIZE" : "BLOCK_SCAN_WARP_SCANS", |
| TILE_SIZE, g_timing_iterations, g_grid_size, BLOCK_THREADS, ITEMS_PER_THREAD, max_sm_occupancy); |
| |
| // Run aggregate/prefix kernel |
| BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>( |
| d_in, |
| d_out, |
| d_elapsed); |
| |
| // Check results |
| printf("\tOutput items: "); |
| int compare = CompareDeviceResults(h_reference, d_out, TILE_SIZE, g_verbose, g_verbose); |
| printf("%s\n", compare ? "FAIL" : "PASS"); |
| AssertEquals(0, compare); |
| |
| // Check total aggregate |
| printf("\tAggregate: "); |
| compare = CompareDeviceResults(&h_aggregate, d_out + TILE_SIZE, 1, g_verbose, g_verbose); |
| printf("%s\n", compare ? "FAIL" : "PASS"); |
| AssertEquals(0, compare); |
| |
| // Run this several times and average the performance results |
| GpuTimer timer; |
| float elapsed_millis = 0.0; |
| clock_t elapsed_clocks = 0; |
| |
| for (int i = 0; i < g_timing_iterations; ++i) |
| { |
| // Copy problem to device |
| cudaMemcpy(d_in, h_in, sizeof(int) * TILE_SIZE, cudaMemcpyHostToDevice); |
| |
| timer.Start(); |
| |
| // Run aggregate/prefix kernel |
| BlockPrefixSumKernel<BLOCK_THREADS, ITEMS_PER_THREAD, ALGORITHM><<<g_grid_size, BLOCK_THREADS>>>( |
| d_in, |
| d_out, |
| d_elapsed); |
| |
| timer.Stop(); |
| elapsed_millis += timer.ElapsedMillis(); |
| |
| // Copy clocks from device |
| clock_t clocks; |
| CubDebugExit(cudaMemcpy(&clocks, d_elapsed, sizeof(clock_t), cudaMemcpyDeviceToHost)); |
| elapsed_clocks += clocks; |
| |
| } |
| |
| // Check for kernel errors and STDIO from the kernel, if any |
| CubDebugExit(cudaPeekAtLastError()); |
| CubDebugExit(cudaDeviceSynchronize()); |
| |
| // Display timing results |
| float avg_millis = elapsed_millis / g_timing_iterations; |
| float avg_items_per_sec = float(TILE_SIZE * g_grid_size) / avg_millis / 1000.0f; |
| float avg_clocks = float(elapsed_clocks) / g_timing_iterations; |
| float avg_clocks_per_item = avg_clocks / TILE_SIZE; |
| |
| printf("\tAverage BlockScan::Sum clocks: %.3f\n", avg_clocks); |
| printf("\tAverage BlockScan::Sum clocks per item: %.3f\n", avg_clocks_per_item); |
| printf("\tAverage kernel millis: %.4f\n", avg_millis); |
| printf("\tAverage million items / sec: %.4f\n", avg_items_per_sec); |
| |
| // Cleanup |
| if (h_in) delete[] h_in; |
| if (h_reference) delete[] h_reference; |
| if (h_gpu) delete[] h_gpu; |
| if (d_in) cudaFree(d_in); |
| if (d_out) cudaFree(d_out); |
| if (d_elapsed) cudaFree(d_elapsed); |
| } |
| |
|
|
| /** |
| * Main |
| */ |
| int main(int argc, char** argv) |
| { |
| // Initialize command line |
| CommandLineArgs args(argc, argv); |
| g_verbose = args.CheckCmdLineFlag("v"); |
| args.GetCmdLineArgument("i", g_timing_iterations); |
| args.GetCmdLineArgument("grid-size", g_grid_size); |
| |
| // Print usage |
| if (args.CheckCmdLineFlag("help")) |
| { |
| printf("%s " |
| "[--device=<device-id>] " |
| "[--i=<timing iterations (default:%d)>]" |
| "[--grid-size=<grid size (default:%d)>]" |
| "[--v] " |
| "\n", argv[0], g_timing_iterations, g_grid_size); |
| exit(0); |
| } |
| |
| // Initialize device |
| CubDebugExit(args.DeviceInit()); |
| |
| // Run tests |
| Test<1024, 1, BLOCK_SCAN_RAKING>(); |
| Test<512, 2, BLOCK_SCAN_RAKING>(); |
| Test<256, 4, BLOCK_SCAN_RAKING>(); |
| Test<128, 8, BLOCK_SCAN_RAKING>(); |
| Test<64, 16, BLOCK_SCAN_RAKING>(); |
| Test<32, 32, BLOCK_SCAN_RAKING>(); |
| |
| printf("-------------\n"); |
| |
| Test<1024, 1, BLOCK_SCAN_RAKING_MEMOIZE>(); |
| Test<512, 2, BLOCK_SCAN_RAKING_MEMOIZE>(); |
| Test<256, 4, BLOCK_SCAN_RAKING_MEMOIZE>(); |
| Test<128, 8, BLOCK_SCAN_RAKING_MEMOIZE>(); |
| Test<64, 16, BLOCK_SCAN_RAKING_MEMOIZE>(); |
| Test<32, 32, BLOCK_SCAN_RAKING_MEMOIZE>(); |
| |
| printf("-------------\n"); |
| |
| Test<1024, 1, BLOCK_SCAN_WARP_SCANS>(); |
| Test<512, 2, BLOCK_SCAN_WARP_SCANS>(); |
| Test<256, 4, BLOCK_SCAN_WARP_SCANS>(); |
| Test<128, 8, BLOCK_SCAN_WARP_SCANS>(); |
| Test<64, 16, BLOCK_SCAN_WARP_SCANS>(); |
| Test<32, 32, BLOCK_SCAN_WARP_SCANS>(); |
| |
|
|
| return 0; |
| } |
| |
|
|