/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/******************************************************************************
 * Test of DeviceSelect::Unique utilities
 ******************************************************************************/
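
// Example invocation (the binary name below is an assumption about the build
// setup, not something defined in this file); the command-line options are
// parsed in main() below:
//
//     ./test_device_select_unique --n=1000000 --i=100 --device=0 --maxseg=1000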

// Ensure printing of CUDA runtime errors to console
#define CUB_STDERR

#include <stdio.h>
#include <typeinfo>

#include <thrust/device_ptr.h>
#include <thrust/unique.h>

#include <cub/util_allocator.cuh>
#include <cub/iterator/counting_input_iterator.cuh>
#include <cub/device/device_select.cuh>
#include "test_util.h" | |
using namespace cub; | |
//--------------------------------------------------------------------- | |
// Globals, constants and typedefs | |
//--------------------------------------------------------------------- | |
bool g_verbose = false; | |
int g_timing_iterations = 0; | |
int g_repeat = 0; | |
float g_device_giga_bandwidth; | |
CachingDeviceAllocator g_allocator(true); | |
// Dispatch types | |
enum Backend | |
{ | |
CUB, // CUB method | |
THRUST, // Thrust method | |
CDP, // GPU-based (dynamic parallelism) dispatch to CUB method | |
}; | |


//---------------------------------------------------------------------
// Dispatch to different CUB DeviceSelect entrypoints
//---------------------------------------------------------------------

/**
 * Dispatch to unique entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t Dispatch(
    Int2Type<CUB>           /*dispatch_to*/,
    int                     timing_timing_iterations,
    size_t                  */*d_temp_storage_bytes*/,
    cudaError_t             */*d_cdp_error*/,
    void                    *d_temp_storage,
    size_t                  &temp_storage_bytes,
    InputIteratorT          d_in,
    OutputIteratorT         d_out,
    NumSelectedIteratorT    d_num_selected_out,
    OffsetT                 num_items,
    cudaStream_t            stream,
    bool                    debug_synchronous)
{
    cudaError_t error = cudaSuccess;
    for (int i = 0; i < timing_timing_iterations; ++i)
    {
        error = DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, stream, debug_synchronous);
    }
    return error;
}
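
/**
 * Illustrative sketch (not part of the original test): the canonical two-phase
 * CUB idiom that the Dispatch wrapper above relies on.  A first call with a
 * NULL d_temp_storage only sizes the temporary storage; a second call performs
 * the selection.  ExampleUniqueSketch is a hypothetical helper name and is not
 * invoked by this test.
 */
cudaError_t ExampleUniqueSketch(int *d_in, int *d_out, int *d_num_selected_out, int num_items)
{
    void    *d_temp_storage     = NULL;
    size_t  temp_storage_bytes  = 0;

    // Sizing pass: no selection is performed, only temp_storage_bytes is written
    cudaError_t error = DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
    if (error != cudaSuccess) return error;

    // Allocate the temporary storage and run the selection for real
    error = cudaMalloc(&d_temp_storage, temp_storage_bytes);
    if (error != cudaSuccess) return error;
    error = DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);

    cudaFree(d_temp_storage);
    return error;
}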


//---------------------------------------------------------------------
// Dispatch to different Thrust entrypoints
//---------------------------------------------------------------------

/**
 * Dispatch to unique entrypoint
 */
template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
__host__ __forceinline__
cudaError_t Dispatch(
    Int2Type<THRUST>        /*dispatch_to*/,
    int                     timing_timing_iterations,
    size_t                  */*d_temp_storage_bytes*/,
    cudaError_t             */*d_cdp_error*/,
    void                    *d_temp_storage,
    size_t                  &temp_storage_bytes,
    InputIteratorT          d_in,
    OutputIteratorT         d_out,
    NumSelectedIteratorT    d_num_selected_out,
    OffsetT                 num_items,
    cudaStream_t            /*stream*/,
    bool                    /*debug_synchronous*/)
{
    // The input value type
    typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;

    // The output value type: if the output iterator's value type is void,
    // fall back to the input iterator's value type
    typedef typename If<(Equals<typename std::iterator_traits<OutputIteratorT>::value_type, void>::VALUE),
        typename std::iterator_traits<InputIteratorT>::value_type,
        typename std::iterator_traits<OutputIteratorT>::value_type>::Type OutputT;

    if (d_temp_storage == 0)
    {
        // Sizing pass only: Thrust manages its own temporary storage, so just
        // report a nonzero size to satisfy the caller's allocation step
        temp_storage_bytes = 1;
    }
    else
    {
        thrust::device_ptr<OutputT> d_out_wrapper_end;
        thrust::device_ptr<InputT>  d_in_wrapper(d_in);
        thrust::device_ptr<OutputT> d_out_wrapper(d_out);

        for (int i = 0; i < timing_timing_iterations; ++i)
        {
            d_out_wrapper_end = thrust::unique_copy(d_in_wrapper, d_in_wrapper + num_items, d_out_wrapper);
        }

        OffsetT num_selected = OffsetT(d_out_wrapper_end - d_out_wrapper);
        CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice));
    }

    return cudaSuccess;
}
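
/**
 * Illustrative sketch (not part of the original test): the Thrust equivalent of
 * DeviceSelect::Unique on raw device pointers, mirroring the dispatch above.
 * ExampleThrustUniqueSketch is a hypothetical helper name and is not invoked by
 * this test.  It returns the number of selected (run-compacted) items.
 */
int ExampleThrustUniqueSketch(int *d_in, int *d_out, int num_items)
{
    thrust::device_ptr<int> d_in_begin(d_in);
    thrust::device_ptr<int> d_out_begin(d_out);

    // unique_copy keeps the first element of each run of equal values and
    // returns an iterator one past the last element written
    thrust::device_ptr<int> d_out_end = thrust::unique_copy(d_in_begin, d_in_begin + num_items, d_out_begin);

    return (int) (d_out_end - d_out_begin);
}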


//---------------------------------------------------------------------
// CUDA Nested Parallelism Test Kernel
//---------------------------------------------------------------------
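
// Note: exercising the CDP path requires compiling with -DCUB_CDP and with
// relocatable device code enabled (e.g. nvcc -rdc=true), since the kernel
// below launches DeviceSelect::Unique from the device.  These build flags are
// an assumption about the build setup and are not enforced by this file.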

/**
 * Simple wrapper kernel to invoke DeviceSelect
 */
template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
__global__ void CnpDispatchKernel(
    int                     timing_timing_iterations,
    size_t                  *d_temp_storage_bytes,
    cudaError_t             *d_cdp_error,
    void                    *d_temp_storage,
    size_t                  temp_storage_bytes,
    InputIteratorT          d_in,
    OutputIteratorT         d_out,
    NumSelectedIteratorT    d_num_selected_out,
    OffsetT                 num_items,
    bool                    debug_synchronous)
{
#ifndef CUB_CDP
    (void)timing_timing_iterations;
    (void)d_temp_storage_bytes;
    (void)d_cdp_error;
    (void)d_temp_storage;
    (void)temp_storage_bytes;
    (void)d_in;
    (void)d_out;
    (void)d_num_selected_out;
    (void)num_items;
    (void)debug_synchronous;
    *d_cdp_error = cudaErrorNotSupported;
#else
    *d_cdp_error = Dispatch(Int2Type<CUB>(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, debug_synchronous);
    *d_temp_storage_bytes = temp_storage_bytes;
#endif
}

/**
 * Dispatch to CDP kernel
 */
template <typename InputIteratorT, typename OutputIteratorT, typename NumSelectedIteratorT, typename OffsetT>
cudaError_t Dispatch(
    Int2Type<CDP>           dispatch_to,
    int                     timing_timing_iterations,
    size_t                  *d_temp_storage_bytes,
    cudaError_t             *d_cdp_error,
    void                    *d_temp_storage,
    size_t                  &temp_storage_bytes,
    InputIteratorT          d_in,
    OutputIteratorT         d_out,
    NumSelectedIteratorT    d_num_selected_out,
    OffsetT                 num_items,
    cudaStream_t            stream,
    bool                    debug_synchronous)
{
    // Invoke kernel to invoke device-side dispatch
    CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error,
        d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, debug_synchronous);

    // Copy out temp_storage_bytes
    CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost));

    // Copy out error
    cudaError_t retval;
    CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost));

    return retval;
}


//---------------------------------------------------------------------
// Test generation
//---------------------------------------------------------------------

/**
 * Initialize problem: generate runs of equal keys.  max_segment caps the run
 * length (a negative value produces a single run spanning the whole input;
 * 0 or 1 produces runs of length 1), while entropy_reduction controls the
 * bit-entropy of the randomly drawn run lengths (see RandomBits in test_util.h).
 */
template <typename T>
void Initialize(
    int         entropy_reduction,
    T           *h_in,
    int         num_items,
    int         max_segment)
{
    unsigned int max_int = (unsigned int) -1;

    int key = 0;
    int i = 0;
    while (i < num_items)
    {
        // Select number of repeating occurrences for the current run
        int repeat;
        if (max_segment < 0)
        {
            repeat = num_items;
        }
        else if (max_segment < 2)
        {
            repeat = 1;
        }
        else
        {
            RandomBits(repeat, entropy_reduction);
            repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int));
            repeat = CUB_MAX(1, repeat);
        }

        int j = i;
        while (j < CUB_MIN(i + repeat, num_items))
        {
            InitValue(INTEGER_SEED, h_in[j], key);
            j++;
        }

        i = j;
        key++;
    }

    if (g_verbose)
    {
        printf("Input:\n");
        DisplayResults(h_in, num_items);
        printf("\n\n");
    }
}

/**
 * Solve unique problem (sequential host reference): keep the first item of
 * each run of equal consecutive values and return the number of items kept.
 */
template <
    typename        InputIteratorT,
    typename        T>
int Solve(
    InputIteratorT  h_in,
    T               *h_reference,
    int             num_items)
{
    int num_selected = 0;
    if (num_items > 0)
    {
        h_reference[num_selected] = h_in[0];
        num_selected++;
    }

    for (int i = 1; i < num_items; ++i)
    {
        if (h_in[i] != h_in[i - 1])
        {
            h_reference[num_selected] = h_in[i];
            num_selected++;
        }
    }

    return num_selected;
}
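
/**
 * Illustrative sketch (not part of the original test): a tiny host-side sanity
 * check of Solve on a hand-written input, e.g. {0, 0, 1, 1, 1, 2} compacts to
 * {0, 1, 2}.  ExampleSolveSketch is a hypothetical helper name and is not
 * invoked by main().
 */
void ExampleSolveSketch()
{
    int h_in[6] = {0, 0, 1, 1, 1, 2};
    int h_reference[6];

    int num_selected = Solve(h_in, h_reference, 6);

    AssertEquals(3, num_selected);
    AssertEquals(0, h_reference[0]);
    AssertEquals(1, h_reference[1]);
    AssertEquals(2, h_reference[2]);
}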

/**
 * Test DeviceSelect for a given problem input
 */
template <
    Backend             BACKEND,
    typename            DeviceInputIteratorT,
    typename            T>
void Test(
    DeviceInputIteratorT d_in,
    T                   *h_reference,
    int                 num_selected,
    int                 num_items)
{
    // Allocate device output array and num selected
    T   *d_out              = NULL;
    int *d_num_selected_out = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * num_items));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int)));

    // Allocate CDP device arrays
    size_t      *d_temp_storage_bytes   = NULL;
    cudaError_t *d_cdp_error            = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1));
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1));

    // Allocate temporary storage
    void    *d_temp_storage     = NULL;
    size_t  temp_storage_bytes  = 0;
    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, true));
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Clear device output array
    CubDebugExit(cudaMemset(d_out, 0, sizeof(T) * num_items));
    CubDebugExit(cudaMemset(d_num_selected_out, 0, sizeof(int)));

    // Run warmup/correctness iteration
    CubDebugExit(Dispatch(Int2Type<BACKEND>(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, true));

    // Check for correctness (and display results, if specified)
    int compare1 = CompareDeviceResults(h_reference, d_out, num_selected, true, g_verbose);
    printf("\t Data %s ", compare1 ? "FAIL" : "PASS");

    int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose);
    printf("\t Count %s ", compare2 ? "FAIL" : "PASS");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Performance
    GpuTimer gpu_timer;
    gpu_timer.Start();
    CubDebugExit(Dispatch(Int2Type<BACKEND>(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, 0, false));
    gpu_timer.Stop();
    float elapsed_millis = gpu_timer.ElapsedMillis();

    // Display performance
    if (g_timing_iterations > 0)
    {
        float avg_millis        = elapsed_millis / g_timing_iterations;
        float giga_rate         = float(num_items) / avg_millis / 1000.0f / 1000.0f;
        float giga_bandwidth    = float((num_items + num_selected) * sizeof(T)) / avg_millis / 1000.0f / 1000.0f;
        printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0);
    }
    printf("\n\n");

    // Flush any stdout/stderr
    fflush(stdout);
    fflush(stderr);

    // Cleanup
    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
    if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out));
    if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes));
    if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    // Correctness asserts
    AssertEquals(0, compare1 | compare2);
}

/**
 * Test DeviceSelect on pointer type
 */
template <
    Backend         BACKEND,
    typename        T>
void TestPointer(
    int             num_items,
    int             entropy_reduction,
    int             max_segment)
{
    // Allocate host arrays
    T* h_in        = new T[num_items];
    T* h_reference = new T[num_items];

    // Initialize problem and solution
    Initialize(entropy_reduction, h_in, num_items, max_segment);
    int num_selected = Solve(h_in, h_reference, num_items);

    printf("\nPointer %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements, entropy_reduction %d\n",
        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
        num_items, num_selected, float(num_items) / num_selected,
        typeid(T).name(),
        (int) sizeof(T),
        entropy_reduction);
    fflush(stdout);

    // Allocate problem device arrays
    T *d_in = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(T) * num_items));

    // Initialize device input
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(T) * num_items, cudaMemcpyHostToDevice));

    // Run Test
    Test<BACKEND>(d_in, h_reference, num_selected, num_items);

    // Cleanup
    if (h_in) delete[] h_in;
    if (h_reference) delete[] h_reference;
    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
}

/**
 * Test DeviceSelect on iterator type
 */
template <
    Backend         BACKEND,
    typename        T>
void TestIterator(
    int             num_items)
{
    // Use a counting iterator as the input
    CountingInputIterator<T, int> h_in(0);

    // Allocate host arrays
    T* h_reference = new T[num_items];

    // Initialize problem and solution
    int num_selected = Solve(h_in, h_reference, num_items);

    printf("\nIterator %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements\n",
        (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB",
        num_items, num_selected, float(num_items) / num_selected,
        typeid(T).name(),
        (int) sizeof(T));
    fflush(stdout);

    // Run Test
    Test<BACKEND>(h_in, h_reference, num_selected, num_items);

    // Cleanup
    if (h_reference) delete[] h_reference;
}

/**
 * Test different gen modes
 */
template <
    Backend         BACKEND,
    typename        T>
void Test(
    int             num_items)
{
    for (int max_segment = 1; ((max_segment > 0) && (max_segment < num_items)); max_segment *= 11)
    {
        TestPointer<BACKEND, T>(num_items, 0, max_segment);
        TestPointer<BACKEND, T>(num_items, 2, max_segment);
        TestPointer<BACKEND, T>(num_items, 7, max_segment);
    }
}

/**
 * Test different dispatch
 */
template <
    typename        T>
void TestOp(
    int             num_items)
{
    Test<CUB, T>(num_items);
#ifdef CUB_CDP
    Test<CDP, T>(num_items);
#endif
}

/**
 * Test different input sizes
 */
template <typename T>
void Test(
    int             num_items)
{
    if (num_items < 0)
    {
        TestOp<T>(0);
        TestOp<T>(1);
        TestOp<T>(100);
        TestOp<T>(10000);
        TestOp<T>(1000000);
    }
    else
    {
        TestOp<T>(num_items);
    }
}


//---------------------------------------------------------------------
// Main
//---------------------------------------------------------------------

/**
 * Main
 */
int main(int argc, char** argv)
{
    int num_items           = -1;
    int entropy_reduction   = 0;
    int maxseg              = 1000;

    // Initialize command line
    CommandLineArgs args(argc, argv);
    g_verbose = args.CheckCmdLineFlag("v");
    args.GetCmdLineArgument("n", num_items);
    args.GetCmdLineArgument("i", g_timing_iterations);
    args.GetCmdLineArgument("repeat", g_repeat);
    args.GetCmdLineArgument("maxseg", maxseg);
    args.GetCmdLineArgument("entropy", entropy_reduction);

    // Print usage
    if (args.CheckCmdLineFlag("help"))
    {
        printf("%s "
            "[--n=<input items>] "
            "[--i=<timing iterations>] "
            "[--device=<device-id>] "
            "[--maxseg=<max segment length>] "
            "[--entropy=<segment length bit entropy reduction rounds>] "
            "[--repeat=<repetitions of entire test suite>] "
            "[--v] "
            "[--cdp] "
            "\n", argv[0]);
        exit(0);
    }

    // Initialize device
    CubDebugExit(args.DeviceInit());
    g_device_giga_bandwidth = args.device_giga_bandwidth;
    printf("\n");

#ifdef QUICKER_TEST

    // Compile/run basic CUB test
    if (num_items < 0) num_items = 32000000;
    TestPointer<CUB, int>(num_items, entropy_reduction, maxseg);

#elif defined(QUICK_TEST)

    // Get device ordinal
    int device_ordinal;
    CubDebugExit(cudaGetDevice(&device_ordinal));

    // Get device SM version
    int sm_version;
    CubDebugExit(SmVersion(sm_version, device_ordinal));

    // Compile/run quick tests
    if (num_items < 0) num_items = 32000000;

    printf("-- Iterator ----------------------------\n");
    TestIterator<CUB, int>(num_items);

    printf("----------------------------\n");
    TestPointer<CUB, char>(num_items * ((sm_version <= 130) ? 1 : 4), entropy_reduction, maxseg);
    TestPointer<THRUST, char>(num_items * ((sm_version <= 130) ? 1 : 4), entropy_reduction, maxseg);

    printf("----------------------------\n");
    TestPointer<CUB, short>(num_items * ((sm_version <= 130) ? 1 : 2), entropy_reduction, maxseg);
    TestPointer<THRUST, short>(num_items * ((sm_version <= 130) ? 1 : 2), entropy_reduction, maxseg);

    printf("----------------------------\n");
    TestPointer<CUB, int>(num_items, entropy_reduction, maxseg);
    TestPointer<THRUST, int>(num_items, entropy_reduction, maxseg);

    printf("----------------------------\n");
    TestPointer<CUB, long long>(num_items / 2, entropy_reduction, maxseg);
    TestPointer<THRUST, long long>(num_items / 2, entropy_reduction, maxseg);

    printf("----------------------------\n");
    TestPointer<CUB, TestFoo>(num_items / 4, entropy_reduction, maxseg);
    TestPointer<THRUST, TestFoo>(num_items / 4, entropy_reduction, maxseg);

#else

    // Compile/run thorough tests
    for (int i = 0; i <= g_repeat; ++i)
    {
        // Test different input types
        Test<unsigned char>(num_items);
        Test<unsigned short>(num_items);
        Test<unsigned int>(num_items);
        Test<unsigned long long>(num_items);

        Test<uchar2>(num_items);
        Test<ushort2>(num_items);
        Test<uint2>(num_items);
        Test<ulonglong2>(num_items);

        Test<uchar4>(num_items);
        Test<ushort4>(num_items);
        Test<uint4>(num_items);
        Test<ulonglong4>(num_items);

        Test<TestFoo>(num_items);
        Test<TestBar>(num_items);
    }

#endif

    return 0;
}