Androidonnxfork
/

test

Model card Files Files and versions Community

test / build /pthreadpool-source /include /pthreadpool.h

Androidonnxfork

Upload folder using huggingface_hub

8b7c501 over 1 year ago

raw

history blame

84 kB

	#ifndef PTHREADPOOL_H_
	#define PTHREADPOOL_H_

	#include <stddef.h>
	#include <stdint.h>

	typedef struct pthreadpool* pthreadpool_t;

	/**
	* Task callback types accepted by the pthreadpool_parallelize_* functions.
	*
	* Each type is a pointer to a function returning void. The first argument is
	* the opaque context pointer supplied by the caller of the parallelization
	* function; the remaining size_t arguments are the grid indices, followed by
	* the tile extents for the tiled variants, e.g. for a 2D grid tiled along both
	* dimensions the call is function(context, i, j, tile_i, tile_j).
	*
	* NOTE(review): the 6D variants presumably follow the same index-then-tile
	* argument order; confirm against the corresponding parallelize functions.
	*/
	typedef void (*pthreadpool_task_1d_t)(void*, size_t);
	typedef void (*pthreadpool_task_1d_tile_1d_t)(void*, size_t, size_t);
	typedef void (*pthreadpool_task_2d_t)(void*, size_t, size_t);
	typedef void (*pthreadpool_task_2d_tile_1d_t)(void*, size_t, size_t, size_t);
	typedef void (*pthreadpool_task_2d_tile_2d_t)(void*, size_t, size_t, size_t, size_t);
	typedef void (*pthreadpool_task_3d_t)(void*, size_t, size_t, size_t);
	typedef void (*pthreadpool_task_3d_tile_1d_t)(void*, size_t, size_t, size_t, size_t);
	typedef void (*pthreadpool_task_3d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t);
	typedef void (*pthreadpool_task_4d_t)(void*, size_t, size_t, size_t, size_t);
	typedef void (*pthreadpool_task_4d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t);
	typedef void (*pthreadpool_task_4d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
	typedef void (*pthreadpool_task_5d_t)(void*, size_t, size_t, size_t, size_t, size_t);
	typedef void (*pthreadpool_task_5d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
	typedef void (*pthreadpool_task_5d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
	typedef void (*pthreadpool_task_6d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
	typedef void (*pthreadpool_task_6d_tile_1d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t);
	typedef void (*pthreadpool_task_6d_tile_2d_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t);

	/**
	* Microarchitecture-aware task callback types accepted by the
	* pthreadpool_parallelize_*_with_uarch functions.
	*
	* Each type is a pointer to a function returning void. The first argument is
	* the opaque context pointer supplied by the caller, the second is the
	* microarchitecture index selected for the call, and the remaining size_t
	* arguments are the grid indices followed by the tile extents, e.g.
	* function(context, uarch_index, i, j, tile_i, tile_j) for a tiled 2D grid.
	*/
	typedef void (*pthreadpool_task_1d_with_id_t)(void*, uint32_t, size_t);
	typedef void (*pthreadpool_task_2d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t);
	typedef void (*pthreadpool_task_3d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t);
	typedef void (*pthreadpool_task_4d_tile_2d_with_id_t)(void*, uint32_t, size_t, size_t, size_t, size_t, size_t, size_t);


	/**
	* Disable support for denormalized numbers to the maximum extent possible for
	* the duration of the computation.
	*
	* Handling denormalized floating-point numbers is often implemented in
	* microcode, and incurs significant performance degradation. This hint
	* instructs the thread pool to disable support for denormalized numbers before
	* running the computation by manipulating architecture-specific control
	* registers, and to restore the initial value of the control registers after
	* the computation is complete. The thread pool temporarily disables
	* denormalized numbers on all threads involved in the computation (i.e. the
	* caller threads, and potentially worker threads).
	*
	* Disabling denormalized numbers may have a small negative effect on results'
	* accuracy. As various architectures differ in capabilities to control
	* processing of denormalized numbers, using this flag may also hurt results'
	* reproducibility across different instruction set architectures.
	*
	* Pass this value in the flags argument of the pthreadpool_parallelize_*
	* functions; it may be combined with PTHREADPOOL_FLAG_YIELD_WORKERS using
	* bitwise OR.
	*/
	#define PTHREADPOOL_FLAG_DISABLE_DENORMALS 0x00000001

	/**
	* Yield worker threads to the system scheduler after the operation is finished.
	*
	* Force workers to use kernel wait (instead of the default active spin-wait)
	* for new commands after this command is processed. This flag affects only
	* the immediate next operation on this thread pool. To make the thread pool
	* always use kernel wait, pass this flag to all parallelization functions.
	*
	* Pass this value in the flags argument of the pthreadpool_parallelize_*
	* functions; it may be combined with PTHREADPOOL_FLAG_DISABLE_DENORMALS using
	* bitwise OR.
	*/
	#define PTHREADPOOL_FLAG_YIELD_WORKERS 0x00000002

	#ifdef __cplusplus
	extern "C" {
	#endif

	/**
	* Create a thread pool with the specified number of threads.
	*
	* The returned handle is the value that the pthreadpool_parallelize_*
	* functions and pthreadpool_get_threads_count() take as their threadpool
	* argument.
	*
	* @param threads_count the number of threads in the thread pool.
	* A value of 0 has special interpretation: it creates a thread pool with as
	* many threads as there are logical processors in the system.
	*
	* @returns A pointer to an opaque thread pool object if the call is
	* successful, or NULL pointer if the call failed.
	*/
	pthreadpool_t pthreadpool_create(size_t threads_count);

	/**
	* Query the number of threads in a thread pool.
	*
	* NOTE(review): the result for a NULL threadpool is not specified in this
	* header; confirm against the implementation before passing NULL.
	*
	* @param threadpool the thread pool to query.
	*
	* @returns The number of threads in the thread pool.
	*/
	size_t pthreadpool_get_threads_count(pthreadpool_t threadpool);

	/**
	* Process items on a 1D grid.
	*
	* The function implements a parallel version of the following snippet:
	*
	* for (size_t i = 0; i < range; i++)
	* function(context, i);
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param function the function to call for each item.
	* @param context the first argument passed to the specified function.
	* @param range the number of items on the 1D grid to process. The
	* specified function will be called once for each item.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_1d(
		pthreadpool_t threadpool,
		pthreadpool_task_1d_t function,
		void* context,
		size_t range,
		uint32_t flags);

	/**
	* Process items on a 1D grid using a microarchitecture-aware task function.
	*
	* The function implements a parallel version of the following snippet:
	*
	* uint32_t uarch_index = cpuinfo_initialize() ?
	* cpuinfo_get_current_uarch_index() : default_uarch_index;
	* if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
	* for (size_t i = 0; i < range; i++)
	* function(context, uarch_index, i);
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If
	* threadpool is NULL, all items are processed serially on the calling
	* thread.
	* @param function the function to call for each item.
	* @param context the first argument passed to the specified
	* function.
	* @param default_uarch_index the microarchitecture index to use when
	* pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
	* or index returned by cpuinfo_get_current_uarch_index() exceeds the
	* max_uarch_index value.
	* @param max_uarch_index the maximum microarchitecture index expected by
	* the specified function. If the index returned by
	* cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
	* will be used instead. default_uarch_index can exceed max_uarch_index.
	* @param range the number of items on the 1D grid to process.
	* The specified function will be called once for each item.
	* @param flags a bitwise combination of zero or more optional
	* flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
	* PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_1d_with_uarch(
		pthreadpool_t threadpool,
		pthreadpool_task_1d_with_id_t function,
		void* context,
		uint32_t default_uarch_index,
		uint32_t max_uarch_index,
		size_t range,
		uint32_t flags);

	/**
	* Process items on a 1D grid with specified maximum tile size.
	*
	* The function implements a parallel version of the following snippet:
	*
	* for (size_t i = 0; i < range; i += tile)
	* function(context, i, min(range - i, tile));
	*
	* When the call returns, all items have been processed and the thread pool is
	* ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool,
	* the calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param function the function to call for each tile.
	* @param context the first argument passed to the specified function.
	* @param range the number of items on the 1D grid to process.
	* @param tile the maximum number of items on the 1D grid to process in
	* one function call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_1d_tile_1d(
		pthreadpool_t threadpool,
		pthreadpool_task_1d_tile_1d_t function,
		void* context,
		size_t range,
		size_t tile,
		uint32_t flags);

	/**
	* Process items on a 2D grid.
	*
	* The function implements a parallel version of the following snippet:
	*
	* for (size_t i = 0; i < range_i; i++)
	* for (size_t j = 0; j < range_j; j++)
	* function(context, i, j);
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param function the function to call for each item.
	* @param context the first argument passed to the specified function.
	* @param range_i the number of items to process along the first dimension
	* of the 2D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 2D grid.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_2d(
		pthreadpool_t threadpool,
		pthreadpool_task_2d_t function,
		void* context,
		size_t range_i,
		size_t range_j,
		uint32_t flags);

	/**
	* Process items on a 2D grid with the specified maximum tile size along the
	* last grid dimension.
	*
	* The function implements a parallel version of the following snippet:
	*
	* for (size_t i = 0; i < range_i; i++)
	* for (size_t j = 0; j < range_j; j += tile_j)
	* function(context, i, j, min(range_j - j, tile_j));
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param function the function to call for each tile.
	* @param context the first argument passed to the specified function.
	* @param range_i the number of items to process along the first dimension
	* of the 2D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 2D grid.
	* @param tile_j the maximum number of items along the second dimension of
	* the 2D grid to process in one function call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_2d_tile_1d(
		pthreadpool_t threadpool,
		pthreadpool_task_2d_tile_1d_t function,
		void* context,
		size_t range_i,
		size_t range_j,
		size_t tile_j,
		uint32_t flags);

	/**
	* Process items on a 2D grid with the specified maximum tile size along each
	* grid dimension.
	*
	* The function implements a parallel version of the following snippet:
	*
	* for (size_t i = 0; i < range_i; i += tile_i)
	* for (size_t j = 0; j < range_j; j += tile_j)
	* function(context, i, j,
	* min(range_i - i, tile_i), min(range_j - j, tile_j));
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param function the function to call for each tile.
	* @param context the first argument passed to the specified function.
	* @param range_i the number of items to process along the first dimension
	* of the 2D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 2D grid.
	* @param tile_i the maximum number of items along the first dimension of
	* the 2D grid to process in one function call.
	* @param tile_j the maximum number of items along the second dimension of
	* the 2D grid to process in one function call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_2d_tile_2d(
		pthreadpool_t threadpool,
		pthreadpool_task_2d_tile_2d_t function,
		void* context,
		size_t range_i,
		size_t range_j,
		size_t tile_i,
		size_t tile_j,
		uint32_t flags);

	/**
	* Process items on a 2D grid with the specified maximum tile size along each
	* grid dimension using a microarchitecture-aware task function.
	*
	* The function implements a parallel version of the following snippet:
	*
	* uint32_t uarch_index = cpuinfo_initialize() ?
	* cpuinfo_get_current_uarch_index() : default_uarch_index;
	* if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
	* for (size_t i = 0; i < range_i; i += tile_i)
	* for (size_t j = 0; j < range_j; j += tile_j)
	* function(context, uarch_index, i, j,
	* min(range_i - i, tile_i), min(range_j - j, tile_j));
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If
	* threadpool is NULL, all items are processed serially on the calling
	* thread.
	* @param function the function to call for each tile.
	* @param context the first argument passed to the specified
	* function.
	* @param default_uarch_index the microarchitecture index to use when
	* pthreadpool is configured without cpuinfo,
	* cpuinfo initialization failed, or index returned
	* by cpuinfo_get_current_uarch_index() exceeds
	* the max_uarch_index value.
	* @param max_uarch_index the maximum microarchitecture index expected
	* by the specified function. If the index returned
	* by cpuinfo_get_current_uarch_index() exceeds this
	* value, default_uarch_index will be used instead.
	* default_uarch_index can exceed max_uarch_index.
	* @param range_i the number of items to process along the first
	* dimension of the 2D grid.
	* @param range_j the number of items to process along the second
	* dimension of the 2D grid.
	* @param tile_i the maximum number of items along the first
	* dimension of the 2D grid to process in one function call.
	* @param tile_j the maximum number of items along the second
	* dimension of the 2D grid to process in one function call.
	* @param flags a bitwise combination of zero or more optional
	* flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
	* PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_2d_tile_2d_with_uarch(
		pthreadpool_t threadpool,
		pthreadpool_task_2d_tile_2d_with_id_t function,
		void* context,
		uint32_t default_uarch_index,
		uint32_t max_uarch_index,
		size_t range_i,
		size_t range_j,
		size_t tile_i,
		size_t tile_j,
		uint32_t flags);

	/**
	* Process items on a 3D grid.
	*
	* The function implements a parallel version of the following snippet:
	*
	* for (size_t i = 0; i < range_i; i++)
	* for (size_t j = 0; j < range_j; j++)
	* for (size_t k = 0; k < range_k; k++)
	* function(context, i, j, k);
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param function the function to call for each item.
	* @param context the first argument passed to the specified function.
	* @param range_i the number of items to process along the first dimension
	* of the 3D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 3D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 3D grid.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_3d(
		pthreadpool_t threadpool,
		pthreadpool_task_3d_t function,
		void* context,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		uint32_t flags);

	/**
	* Process items on a 3D grid with the specified maximum tile size along the
	* last grid dimension.
	*
	* The function implements a parallel version of the following snippet:
	*
	* for (size_t i = 0; i < range_i; i++)
	* for (size_t j = 0; j < range_j; j++)
	* for (size_t k = 0; k < range_k; k += tile_k)
	* function(context, i, j, k, min(range_k - k, tile_k));
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param function the function to call for each tile.
	* @param context the first argument passed to the specified function.
	* @param range_i the number of items to process along the first dimension
	* of the 3D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 3D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 3D grid.
	* @param tile_k the maximum number of items along the third dimension of
	* the 3D grid to process in one function call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_3d_tile_1d(
		pthreadpool_t threadpool,
		pthreadpool_task_3d_tile_1d_t function,
		void* context,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		size_t tile_k,
		uint32_t flags);

	/**
	* Process items on a 3D grid with the specified maximum tile size along the
	* last two grid dimensions.
	*
	* The function implements a parallel version of the following snippet:
	*
	* for (size_t i = 0; i < range_i; i++)
	* for (size_t j = 0; j < range_j; j += tile_j)
	* for (size_t k = 0; k < range_k; k += tile_k)
	* function(context, i, j, k,
	* min(range_j - j, tile_j), min(range_k - k, tile_k));
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param function the function to call for each tile.
	* @param context the first argument passed to the specified function.
	* @param range_i the number of items to process along the first dimension
	* of the 3D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 3D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 3D grid.
	* @param tile_j the maximum number of items along the second dimension of
	* the 3D grid to process in one function call.
	* @param tile_k the maximum number of items along the third dimension of
	* the 3D grid to process in one function call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_3d_tile_2d(
		pthreadpool_t threadpool,
		pthreadpool_task_3d_tile_2d_t function,
		void* context,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		size_t tile_j,
		size_t tile_k,
		uint32_t flags);

	/**
	* Process items on a 3D grid with the specified maximum tile size along the
	* last two grid dimensions using a microarchitecture-aware task function.
	*
	* The function implements a parallel version of the following snippet:
	*
	* uint32_t uarch_index = cpuinfo_initialize() ?
	* cpuinfo_get_current_uarch_index() : default_uarch_index;
	* if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
	* for (size_t i = 0; i < range_i; i++)
	* for (size_t j = 0; j < range_j; j += tile_j)
	* for (size_t k = 0; k < range_k; k += tile_k)
	* function(context, uarch_index, i, j, k,
	* min(range_j - j, tile_j), min(range_k - k, tile_k));
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If
	* threadpool is NULL, all items are processed serially on the calling
	* thread.
	* @param function the function to call for each tile.
	* @param context the first argument passed to the specified
	* function.
	* @param default_uarch_index the microarchitecture index to use when
	* pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
	* or index returned by cpuinfo_get_current_uarch_index() exceeds the
	* max_uarch_index value.
	* @param max_uarch_index the maximum microarchitecture index expected by
	* the specified function. If the index returned by
	* cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
	* will be used instead. default_uarch_index can exceed max_uarch_index.
	* @param range_i the number of items to process along the first
	* dimension of the 3D grid.
	* @param range_j the number of items to process along the second
	* dimension of the 3D grid.
	* @param range_k the number of items to process along the third
	* dimension of the 3D grid.
	* @param tile_j the maximum number of items along the second
	* dimension of the 3D grid to process in one function call.
	* @param tile_k the maximum number of items along the third
	* dimension of the 3D grid to process in one function call.
	* @param flags a bitwise combination of zero or more optional
	* flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
	* PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_3d_tile_2d_with_uarch(
		pthreadpool_t threadpool,
		pthreadpool_task_3d_tile_2d_with_id_t function,
		void* context,
		uint32_t default_uarch_index,
		uint32_t max_uarch_index,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		size_t tile_j,
		size_t tile_k,
		uint32_t flags);

	/**
	* Process items on a 4D grid.
	*
	* The function implements a parallel version of the following snippet:
	*
	* for (size_t i = 0; i < range_i; i++)
	* for (size_t j = 0; j < range_j; j++)
	* for (size_t k = 0; k < range_k; k++)
	* for (size_t l = 0; l < range_l; l++)
	* function(context, i, j, k, l);
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param function the function to call for each item.
	* @param context the first argument passed to the specified function.
	* @param range_i the number of items to process along the first dimension
	* of the 4D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 4D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 4D grid.
	* @param range_l the number of items to process along the fourth dimension
	* of the 4D grid.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_4d(
		pthreadpool_t threadpool,
		pthreadpool_task_4d_t function,
		void* context,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		size_t range_l,
		uint32_t flags);

	/**
	* Process items on a 4D grid with the specified maximum tile size along the
	* last grid dimension.
	*
	* The function implements a parallel version of the following snippet:
	*
	* for (size_t i = 0; i < range_i; i++)
	* for (size_t j = 0; j < range_j; j++)
	* for (size_t k = 0; k < range_k; k++)
	* for (size_t l = 0; l < range_l; l += tile_l)
	* function(context, i, j, k, l, min(range_l - l, tile_l));
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param function the function to call for each tile.
	* @param context the first argument passed to the specified function.
	* @param range_i the number of items to process along the first dimension
	* of the 4D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 4D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 4D grid.
	* @param range_l the number of items to process along the fourth dimension
	* of the 4D grid.
	* @param tile_l the maximum number of items along the fourth dimension of
	* the 4D grid to process in one function call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_4d_tile_1d(
		pthreadpool_t threadpool,
		pthreadpool_task_4d_tile_1d_t function,
		void* context,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		size_t range_l,
		size_t tile_l,
		uint32_t flags);

	/**
	* Process items on a 4D grid with the specified maximum tile size along the
	* last two grid dimensions.
	*
	* The function implements a parallel version of the following snippet:
	*
	* for (size_t i = 0; i < range_i; i++)
	* for (size_t j = 0; j < range_j; j++)
	* for (size_t k = 0; k < range_k; k += tile_k)
	* for (size_t l = 0; l < range_l; l += tile_l)
	* function(context, i, j, k, l,
	* min(range_k - k, tile_k), min(range_l - l, tile_l));
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param function the function to call for each tile.
	* @param context the first argument passed to the specified function.
	* @param range_i the number of items to process along the first dimension
	* of the 4D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 4D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 4D grid.
	* @param range_l the number of items to process along the fourth dimension
	* of the 4D grid.
	* @param tile_k the maximum number of items along the third dimension of
	* the 4D grid to process in one function call.
	* @param tile_l the maximum number of items along the fourth dimension of
	* the 4D grid to process in one function call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_4d_tile_2d(
		pthreadpool_t threadpool,
		pthreadpool_task_4d_tile_2d_t function,
		void* context,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		size_t range_l,
		size_t tile_k,
		size_t tile_l,
		uint32_t flags);

	/**
	* Process items on a 4D grid with the specified maximum tile size along the
	* last two grid dimensions using a microarchitecture-aware task function.
	*
	* The function implements a parallel version of the following snippet:
	*
	* uint32_t uarch_index = cpuinfo_initialize() ?
	* cpuinfo_get_current_uarch_index() : default_uarch_index;
	* if (uarch_index > max_uarch_index) uarch_index = default_uarch_index;
	* for (size_t i = 0; i < range_i; i++)
	* for (size_t j = 0; j < range_j; j++)
	* for (size_t k = 0; k < range_k; k += tile_k)
	* for (size_t l = 0; l < range_l; l += tile_l)
	* function(context, uarch_index, i, j, k, l,
	* min(range_k - k, tile_k), min(range_l - l, tile_l));
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If
	* threadpool is NULL, all items are processed serially on the calling
	* thread.
	* @param function the function to call for each tile.
	* @param context the first argument passed to the specified
	* function.
	* @param default_uarch_index the microarchitecture index to use when
	* pthreadpool is configured without cpuinfo, cpuinfo initialization failed,
	* or index returned by cpuinfo_get_current_uarch_index() exceeds the
	* max_uarch_index value.
	* @param max_uarch_index the maximum microarchitecture index expected by
	* the specified function. If the index returned by
	* cpuinfo_get_current_uarch_index() exceeds this value, default_uarch_index
	* will be used instead. default_uarch_index can exceed max_uarch_index.
	* @param range_i the number of items to process along the first
	* dimension of the 4D grid.
	* @param range_j the number of items to process along the second
	* dimension of the 4D grid.
	* @param range_k the number of items to process along the third
	* dimension of the 4D grid.
	* @param range_l the number of items to process along the fourth
	* dimension of the 4D grid.
	* @param tile_k the maximum number of items along the third
	* dimension of the 4D grid to process in one function call.
	* @param tile_l the maximum number of items along the fourth
	* dimension of the 4D grid to process in one function call.
	* @param flags a bitwise combination of zero or more optional
	* flags (PTHREADPOOL_FLAG_DISABLE_DENORMALS or
	* PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_4d_tile_2d_with_uarch(
		pthreadpool_t threadpool,
		pthreadpool_task_4d_tile_2d_with_id_t function,
		void* context,
		uint32_t default_uarch_index,
		uint32_t max_uarch_index,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		size_t range_l,
		size_t tile_k,
		size_t tile_l,
		uint32_t flags);

	/**
	* Process items on a 5D grid.
	*
	* The function implements a parallel version of the following snippet:
	*
	* for (size_t i = 0; i < range_i; i++)
	* for (size_t j = 0; j < range_j; j++)
	* for (size_t k = 0; k < range_k; k++)
	* for (size_t l = 0; l < range_l; l++)
	* for (size_t m = 0; m < range_m; m++)
	* function(context, i, j, k, l, m);
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param function the function to call for each item.
	* @param context the first argument passed to the specified function.
	* @param range_i the number of items to process along the first dimension
	* of the 5D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 5D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 5D grid.
	* @param range_l the number of items to process along the fourth dimension
	* of the 5D grid.
	* @param range_m the number of items to process along the fifth dimension
	* of the 5D grid.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_5d(
	pthreadpool_t threadpool,
	pthreadpool_task_5d_t function,
	void* context,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	uint32_t flags);

	/**
	* Process items on a 5D grid with the specified maximum tile size along the
	* last grid dimension.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range_i; i++)
	*     for (size_t j = 0; j < range_j; j++)
	*       for (size_t k = 0; k < range_k; k++)
	*         for (size_t l = 0; l < range_l; l++)
	*           for (size_t m = 0; m < range_m; m += tile_m)
	*             function(context, i, j, k, l, m, min(range_m - m, tile_m));
	*
	* The last argument passed to the function is the number of items actually
	* covered by the tile; it is smaller than tile_m only for the final tile
	* along the fifth dimension when range_m is not a multiple of tile_m.
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param function the function to call for each tile.
	* @param context the first argument passed to the specified function.
	* @param range_i the number of items to process along the first dimension
	* of the 5D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 5D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 5D grid.
	* @param range_l the number of items to process along the fourth dimension
	* of the 5D grid.
	* @param range_m the number of items to process along the fifth dimension
	* of the 5D grid.
	* @param tile_m the maximum number of items along the fifth dimension of
	* the 5D grid to process in one function call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_5d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_5d_tile_1d_t function,
	void* context,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t tile_m,
	uint32_t flags);

	/**
	* Process items on a 5D grid with the specified maximum tile size along the
	* last two grid dimensions.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range_i; i++)
	*     for (size_t j = 0; j < range_j; j++)
	*       for (size_t k = 0; k < range_k; k++)
	*         for (size_t l = 0; l < range_l; l += tile_l)
	*           for (size_t m = 0; m < range_m; m += tile_m)
	*             function(context, i, j, k, l, m,
	*               min(range_l - l, tile_l), min(range_m - m, tile_m));
	*
	* The last two arguments passed to the function are the numbers of items
	* actually covered by the tile along the fourth and fifth dimensions; each
	* is smaller than the requested tile size only for the final tile along its
	* dimension.
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param function the function to call for each tile.
	* @param context the first argument passed to the specified function.
	* @param range_i the number of items to process along the first dimension
	* of the 5D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 5D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 5D grid.
	* @param range_l the number of items to process along the fourth dimension
	* of the 5D grid.
	* @param range_m the number of items to process along the fifth dimension
	* of the 5D grid.
	* @param tile_l the maximum number of items along the fourth dimension of
	* the 5D grid to process in one function call.
	* @param tile_m the maximum number of items along the fifth dimension of
	* the 5D grid to process in one function call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_5d_tile_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_5d_tile_2d_t function,
	void* context,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t tile_l,
	size_t tile_m,
	uint32_t flags);

	/**
	* Process items on a 6D grid.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range_i; i++)
	*     for (size_t j = 0; j < range_j; j++)
	*       for (size_t k = 0; k < range_k; k++)
	*         for (size_t l = 0; l < range_l; l++)
	*           for (size_t m = 0; m < range_m; m++)
	*             for (size_t n = 0; n < range_n; n++)
	*               function(context, i, j, k, l, m, n);
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param function the function to call for each item.
	* @param context the first argument passed to the specified function.
	* @param range_i the number of items to process along the first dimension
	* of the 6D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 6D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 6D grid.
	* @param range_l the number of items to process along the fourth dimension
	* of the 6D grid.
	* @param range_m the number of items to process along the fifth dimension
	* of the 6D grid.
	* @param range_n the number of items to process along the sixth dimension
	* of the 6D grid.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_6d(
	pthreadpool_t threadpool,
	pthreadpool_task_6d_t function,
	void* context,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t range_n,
	uint32_t flags);

	/**
	* Process items on a 6D grid with the specified maximum tile size along the
	* last grid dimension.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range_i; i++)
	*     for (size_t j = 0; j < range_j; j++)
	*       for (size_t k = 0; k < range_k; k++)
	*         for (size_t l = 0; l < range_l; l++)
	*           for (size_t m = 0; m < range_m; m++)
	*             for (size_t n = 0; n < range_n; n += tile_n)
	*               function(context, i, j, k, l, m, n, min(range_n - n, tile_n));
	*
	* The last argument passed to the function is the number of items actually
	* covered by the tile; it is smaller than tile_n only for the final tile
	* along the sixth dimension when range_n is not a multiple of tile_n.
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param function the function to call for each tile.
	* @param context the first argument passed to the specified function.
	* @param range_i the number of items to process along the first dimension
	* of the 6D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 6D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 6D grid.
	* @param range_l the number of items to process along the fourth dimension
	* of the 6D grid.
	* @param range_m the number of items to process along the fifth dimension
	* of the 6D grid.
	* @param range_n the number of items to process along the sixth dimension
	* of the 6D grid.
	* @param tile_n the maximum number of items along the sixth dimension of
	* the 6D grid to process in one function call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_6d_tile_1d(
	pthreadpool_t threadpool,
	pthreadpool_task_6d_tile_1d_t function,
	void* context,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t range_n,
	size_t tile_n,
	uint32_t flags);

	/**
	* Process items on a 6D grid with the specified maximum tile size along the
	* last two grid dimensions.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range_i; i++)
	*     for (size_t j = 0; j < range_j; j++)
	*       for (size_t k = 0; k < range_k; k++)
	*         for (size_t l = 0; l < range_l; l++)
	*           for (size_t m = 0; m < range_m; m += tile_m)
	*             for (size_t n = 0; n < range_n; n += tile_n)
	*               function(context, i, j, k, l, m, n,
	*                 min(range_m - m, tile_m), min(range_n - n, tile_n));
	*
	* The last two arguments passed to the function are the numbers of items
	* actually covered by the tile along the fifth and sixth dimensions; each
	* is smaller than the requested tile size only for the final tile along its
	* dimension.
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param function the function to call for each tile.
	* @param context the first argument passed to the specified function.
	* @param range_i the number of items to process along the first dimension
	* of the 6D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 6D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 6D grid.
	* @param range_l the number of items to process along the fourth dimension
	* of the 6D grid.
	* @param range_m the number of items to process along the fifth dimension
	* of the 6D grid.
	* @param range_n the number of items to process along the sixth dimension
	* of the 6D grid.
	* @param tile_m the maximum number of items along the fifth dimension of
	* the 6D grid to process in one function call.
	* @param tile_n the maximum number of items along the sixth dimension of
	* the 6D grid to process in one function call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	void pthreadpool_parallelize_6d_tile_2d(
	pthreadpool_t threadpool,
	pthreadpool_task_6d_tile_2d_t function,
	void* context,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t range_m,
	size_t range_n,
	size_t tile_m,
	size_t tile_n,
	uint32_t flags);

	/**
	* Terminates threads in the thread pool and releases associated resources.
	*
	* @warning Accessing the thread pool after a call to this function constitutes
	* undefined behaviour and may cause data corruption.
	*
	* @param[in,out] threadpool The thread pool to destroy. The handle must not
	* be used again once this call has been made.
	*/
	void pthreadpool_destroy(pthreadpool_t threadpool);

	#ifndef PTHREADPOOL_NO_DEPRECATED_API

	/*
	* Legacy API for compatibility with pre-existing users (e.g. NNPACK).
	*
	* PTHREADPOOL_DEPRECATED is appended to every legacy prototype in this
	* section. When __GNUC__ is defined it expands to the GNU `deprecated`
	* function attribute; for any other compiler it expands to nothing, so the
	* prototypes stay valid but carry no deprecation marker.
	*/
	#if defined(__GNUC__)
	#define PTHREADPOOL_DEPRECATED __attribute__((__deprecated__))
	#else
	#define PTHREADPOOL_DEPRECATED
	#endif

	/*
	* Callback pointer types accepted by the legacy pthreadpool_compute_* API.
	*
	* Each type is a pointer to a function returning void whose first parameter
	* is an opaque void* followed by size_t values. The number of size_t values
	* matches the variant: one per grid index, plus one per tile extent for the
	* tiled variants.
	* NOTE(review): the void* is presumably the `argument` handed to the matching
	* pthreadpool_compute_* call - confirm against the implementation.
	*/
	typedef void (*pthreadpool_function_1d_t)(void*, size_t);
	typedef void (*pthreadpool_function_1d_tiled_t)(void*, size_t, size_t);
	typedef void (*pthreadpool_function_2d_t)(void*, size_t, size_t);
	typedef void (*pthreadpool_function_2d_tiled_t)(void*, size_t, size_t, size_t, size_t);
	typedef void (*pthreadpool_function_3d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t);
	typedef void (*pthreadpool_function_4d_tiled_t)(void*, size_t, size_t, size_t, size_t, size_t, size_t, size_t, size_t);

	/**
	* Legacy (deprecated) 1D entry point.
	*
	* NOTE(review): the implementation is not visible in this header; this looks
	* like the predecessor of pthreadpool_parallelize_1d - confirm before
	* relying on exact semantics.
	*
	* @param threadpool the thread pool handle.
	* @param function the callback, of type pthreadpool_function_1d_t.
	* @param argument opaque pointer; presumably forwarded to the callback as
	* its first argument - TODO confirm.
	* @param range the extent of the 1D range.
	*/
	void pthreadpool_compute_1d(
	pthreadpool_t threadpool,
	pthreadpool_function_1d_t function,
	void* argument,
	size_t range) PTHREADPOOL_DEPRECATED;

	/**
	* Legacy (deprecated) tiled 1D entry point.
	*
	* NOTE(review): the implementation is not visible in this header; this looks
	* like the predecessor of pthreadpool_parallelize_1d_tile_1d - confirm
	* before relying on exact semantics.
	*
	* @param threadpool the thread pool handle.
	* @param function the callback, of type pthreadpool_function_1d_tiled_t.
	* @param argument opaque pointer; presumably forwarded to the callback as
	* its first argument - TODO confirm.
	* @param range the extent of the 1D range.
	* @param tile the tile size along the range.
	*/
	void pthreadpool_compute_1d_tiled(
	pthreadpool_t threadpool,
	pthreadpool_function_1d_tiled_t function,
	void* argument,
	size_t range,
	size_t tile) PTHREADPOOL_DEPRECATED;

	/**
	* Legacy (deprecated) 2D entry point.
	*
	* NOTE(review): the implementation is not visible in this header; this looks
	* like the predecessor of pthreadpool_parallelize_2d - confirm before
	* relying on exact semantics.
	*
	* @param threadpool the thread pool handle.
	* @param function the callback, of type pthreadpool_function_2d_t.
	* @param argument opaque pointer; presumably forwarded to the callback as
	* its first argument - TODO confirm.
	* @param range_i the extent of the first dimension.
	* @param range_j the extent of the second dimension.
	*/
	void pthreadpool_compute_2d(
	pthreadpool_t threadpool,
	pthreadpool_function_2d_t function,
	void* argument,
	size_t range_i,
	size_t range_j) PTHREADPOOL_DEPRECATED;

	/**
	* Legacy (deprecated) tiled 2D entry point; both dimensions take a tile size.
	*
	* NOTE(review): the implementation is not visible in this header; this looks
	* like the predecessor of pthreadpool_parallelize_2d_tile_2d - confirm
	* before relying on exact semantics.
	*
	* @param threadpool the thread pool handle.
	* @param function the callback, of type pthreadpool_function_2d_tiled_t.
	* @param argument opaque pointer; presumably forwarded to the callback as
	* its first argument - TODO confirm.
	* @param range_i the extent of the first dimension.
	* @param range_j the extent of the second dimension.
	* @param tile_i the tile size along the first dimension.
	* @param tile_j the tile size along the second dimension.
	*/
	void pthreadpool_compute_2d_tiled(
	pthreadpool_t threadpool,
	pthreadpool_function_2d_tiled_t function,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t tile_i,
	size_t tile_j) PTHREADPOOL_DEPRECATED;

	/**
	* Legacy (deprecated) tiled 3D entry point; all three dimensions take a tile
	* size.
	*
	* NOTE(review): the implementation is not visible in this header - confirm
	* the exact iteration and tiling semantics before relying on them.
	*
	* @param threadpool the thread pool handle.
	* @param function the callback, of type pthreadpool_function_3d_tiled_t.
	* @param argument opaque pointer; presumably forwarded to the callback as
	* its first argument - TODO confirm.
	* @param range_i the extent of the first dimension.
	* @param range_j the extent of the second dimension.
	* @param range_k the extent of the third dimension.
	* @param tile_i the tile size along the first dimension.
	* @param tile_j the tile size along the second dimension.
	* @param tile_k the tile size along the third dimension.
	*/
	void pthreadpool_compute_3d_tiled(
	pthreadpool_t threadpool,
	pthreadpool_function_3d_tiled_t function,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t tile_i,
	size_t tile_j,
	size_t tile_k) PTHREADPOOL_DEPRECATED;

	/**
	* Legacy (deprecated) tiled 4D entry point; all four dimensions take a tile
	* size.
	*
	* NOTE(review): the implementation is not visible in this header - confirm
	* the exact iteration and tiling semantics before relying on them.
	*
	* @param threadpool the thread pool handle.
	* @param function the callback, of type pthreadpool_function_4d_tiled_t.
	* @param argument opaque pointer; presumably forwarded to the callback as
	* its first argument - TODO confirm.
	* @param range_i the extent of the first dimension.
	* @param range_j the extent of the second dimension.
	* @param range_k the extent of the third dimension.
	* @param range_l the extent of the fourth dimension.
	* @param tile_i the tile size along the first dimension.
	* @param tile_j the tile size along the second dimension.
	* @param tile_k the tile size along the third dimension.
	* @param tile_l the tile size along the fourth dimension.
	*/
	void pthreadpool_compute_4d_tiled(
	pthreadpool_t threadpool,
	pthreadpool_function_4d_tiled_t function,
	void* argument,
	size_t range_i,
	size_t range_j,
	size_t range_k,
	size_t range_l,
	size_t tile_i,
	size_t tile_j,
	size_t tile_k,
	size_t tile_l) PTHREADPOOL_DEPRECATED;

	#endif /* PTHREADPOOL_NO_DEPRECATED_API */

	#ifdef __cplusplus
	} /* extern "C" */
	#endif

	#ifdef __cplusplus

	namespace libpthreadpool {
	namespace detail {
	namespace {

	/*
	 * Trampolines that adapt a C++ callable to a plain function taking an
	 * opaque void* as its first parameter.
	 *
	 * Each wrapper receives the address of a callable of type T through the
	 * void* parameter, casts it back to `const T*`, dereferences it, and invokes
	 * it with the remaining size_t arguments unchanged and in the same order.
	 * The pointer must therefore refer to a live object of type T.
	 */

	template<class T>
	void call_wrapper_1d(void* arg, size_t i) {
		(*static_cast<const T*>(arg))(i);
	}

	template<class T>
	void call_wrapper_1d_tile_1d(void* arg, size_t range_i, size_t tile_i) {
		(*static_cast<const T*>(arg))(range_i, tile_i);
	}

	template<class T>
	void call_wrapper_2d(void* functor, size_t i, size_t j) {
		(*static_cast<const T*>(functor))(i, j);
	}

	template<class T>
	void call_wrapper_2d_tile_1d(void* functor,
		size_t i, size_t range_j, size_t tile_j)
	{
		(*static_cast<const T*>(functor))(i, range_j, tile_j);
	}

	template<class T>
	void call_wrapper_2d_tile_2d(void* functor,
		size_t range_i, size_t range_j,
		size_t tile_i, size_t tile_j)
	{
		(*static_cast<const T*>(functor))(range_i, range_j, tile_i, tile_j);
	}

	template<class T>
	void call_wrapper_3d(void* functor, size_t i, size_t j, size_t k) {
		(*static_cast<const T*>(functor))(i, j, k);
	}

	template<class T>
	void call_wrapper_3d_tile_1d(void* functor,
		size_t i, size_t j, size_t range_k,
		size_t tile_k)
	{
		(*static_cast<const T*>(functor))(i, j, range_k, tile_k);
	}

	template<class T>
	void call_wrapper_3d_tile_2d(void* functor,
		size_t i, size_t range_j, size_t range_k,
		size_t tile_j, size_t tile_k)
	{
		(*static_cast<const T*>(functor))(i, range_j, range_k, tile_j, tile_k);
	}

	template<class T>
	void call_wrapper_4d(void* functor, size_t i, size_t j, size_t k, size_t l) {
		(*static_cast<const T*>(functor))(i, j, k, l);
	}

	template<class T>
	void call_wrapper_4d_tile_1d(void* functor,
		size_t i, size_t j, size_t k, size_t range_l,
		size_t tile_l)
	{
		(*static_cast<const T*>(functor))(i, j, k, range_l, tile_l);
	}

	template<class T>
	void call_wrapper_4d_tile_2d(void* functor,
		size_t i, size_t j, size_t range_k, size_t range_l,
		size_t tile_k, size_t tile_l)
	{
		(*static_cast<const T*>(functor))(i, j, range_k, range_l, tile_k, tile_l);
	}

	template<class T>
	void call_wrapper_5d(void* functor, size_t i, size_t j, size_t k, size_t l, size_t m) {
		(*static_cast<const T*>(functor))(i, j, k, l, m);
	}

	template<class T>
	void call_wrapper_5d_tile_1d(void* functor,
		size_t i, size_t j, size_t k, size_t l, size_t range_m,
		size_t tile_m)
	{
		(*static_cast<const T*>(functor))(i, j, k, l, range_m, tile_m);
	}

	template<class T>
	void call_wrapper_5d_tile_2d(void* functor,
		size_t i, size_t j, size_t k, size_t range_l, size_t range_m,
		size_t tile_l, size_t tile_m)
	{
		(*static_cast<const T*>(functor))(i, j, k, range_l, range_m, tile_l, tile_m);
	}

	template<class T>
	void call_wrapper_6d(void* functor, size_t i, size_t j, size_t k, size_t l, size_t m, size_t n) {
		(*static_cast<const T*>(functor))(i, j, k, l, m, n);
	}

	template<class T>
	void call_wrapper_6d_tile_1d(void* functor,
		size_t i, size_t j, size_t k, size_t l, size_t m, size_t range_n,
		size_t tile_n)
	{
		(*static_cast<const T*>(functor))(i, j, k, l, m, range_n, tile_n);
	}

	template<class T>
	void call_wrapper_6d_tile_2d(void* functor,
		size_t i, size_t j, size_t k, size_t l, size_t range_m, size_t range_n,
		size_t tile_m, size_t tile_n)
	{
		(*static_cast<const T*>(functor))(i, j, k, l, range_m, range_n, tile_m, tile_n);
	}

	} /* namespace */
	} /* namespace detail */
	} /* namespace libpthreadpool */

	/**
	* Process items on a 1D grid.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range; i++)
	*     functor(i);
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param functor the functor to call for each item.
	* @param range the number of items on the 1D grid to process. The
	* specified functor will be called once for each item.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	template<class T>
	inline void pthreadpool_parallelize_1d(
		pthreadpool_t threadpool,
		const T& functor,
		size_t range,
		uint32_t flags = 0)
	{
		/*
		 * The functor's address is passed as the opaque context pointer. const
		 * is cast away only to match the void* parameter; the wrapper is
		 * instantiated for `const T`.
		 */
		pthreadpool_parallelize_1d(
			threadpool,
			&libpthreadpool::detail::call_wrapper_1d<const T>,
			const_cast<void*>(static_cast<const void*>(&functor)),
			range,
			flags);
	}

	/**
	* Process items on a 1D grid with specified maximum tile size.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range; i += tile)
	*     functor(i, min(range - i, tile));
	*
	* When the call returns, all items have been processed and the thread pool is
	* ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool,
	* the calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param functor the functor to call for each tile.
	* @param range the number of items on the 1D grid to process.
	* @param tile the maximum number of items on the 1D grid to process in
	* one functor call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	template<class T>
	inline void pthreadpool_parallelize_1d_tile_1d(
		pthreadpool_t threadpool,
		const T& functor,
		size_t range,
		size_t tile,
		uint32_t flags = 0)
	{
		/*
		 * The functor's address is passed as the opaque context pointer. const
		 * is cast away only to match the void* parameter; the wrapper is
		 * instantiated for `const T`.
		 */
		pthreadpool_parallelize_1d_tile_1d(
			threadpool,
			&libpthreadpool::detail::call_wrapper_1d_tile_1d<const T>,
			const_cast<void*>(static_cast<const void*>(&functor)),
			range,
			tile,
			flags);
	}

	/**
	* Process items on a 2D grid.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range_i; i++)
	*     for (size_t j = 0; j < range_j; j++)
	*       functor(i, j);
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param functor the functor to call for each item.
	* @param range_i the number of items to process along the first dimension
	* of the 2D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 2D grid.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	template<class T>
	inline void pthreadpool_parallelize_2d(
		pthreadpool_t threadpool,
		const T& functor,
		size_t range_i,
		size_t range_j,
		uint32_t flags = 0)
	{
		/*
		 * The functor's address is passed as the opaque context pointer. const
		 * is cast away only to match the void* parameter; the wrapper is
		 * instantiated for `const T`.
		 */
		pthreadpool_parallelize_2d(
			threadpool,
			&libpthreadpool::detail::call_wrapper_2d<const T>,
			const_cast<void*>(static_cast<const void*>(&functor)),
			range_i,
			range_j,
			flags);
	}

	/**
	* Process items on a 2D grid with the specified maximum tile size along the
	* last grid dimension.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range_i; i++)
	*     for (size_t j = 0; j < range_j; j += tile_j)
	*       functor(i, j, min(range_j - j, tile_j));
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param functor the functor to call for each tile.
	* @param range_i the number of items to process along the first dimension
	* of the 2D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 2D grid.
	* @param tile_j the maximum number of items along the second dimension of
	* the 2D grid to process in one functor call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	template<class T>
	inline void pthreadpool_parallelize_2d_tile_1d(
		pthreadpool_t threadpool,
		const T& functor,
		size_t range_i,
		size_t range_j,
		size_t tile_j,
		uint32_t flags = 0)
	{
		/*
		 * The functor's address is passed as the opaque context pointer. const
		 * is cast away only to match the void* parameter; the wrapper is
		 * instantiated for `const T`.
		 */
		pthreadpool_parallelize_2d_tile_1d(
			threadpool,
			&libpthreadpool::detail::call_wrapper_2d_tile_1d<const T>,
			const_cast<void*>(static_cast<const void*>(&functor)),
			range_i,
			range_j,
			tile_j,
			flags);
	}

	/**
	* Process items on a 2D grid with the specified maximum tile size along each
	* grid dimension.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range_i; i += tile_i)
	*     for (size_t j = 0; j < range_j; j += tile_j)
	*       functor(i, j,
	*         min(range_i - i, tile_i), min(range_j - j, tile_j));
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param functor the functor to call for each tile.
	* @param range_i the number of items to process along the first dimension
	* of the 2D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 2D grid.
	* @param tile_i the maximum number of items along the first dimension of
	* the 2D grid to process in one functor call.
	* @param tile_j the maximum number of items along the second dimension of
	* the 2D grid to process in one functor call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	template<class T>
	inline void pthreadpool_parallelize_2d_tile_2d(
		pthreadpool_t threadpool,
		const T& functor,
		size_t range_i,
		size_t range_j,
		size_t tile_i,
		size_t tile_j,
		uint32_t flags = 0)
	{
		/*
		 * The functor's address is passed as the opaque context pointer. const
		 * is cast away only to match the void* parameter; the wrapper is
		 * instantiated for `const T`.
		 */
		pthreadpool_parallelize_2d_tile_2d(
			threadpool,
			&libpthreadpool::detail::call_wrapper_2d_tile_2d<const T>,
			const_cast<void*>(static_cast<const void*>(&functor)),
			range_i,
			range_j,
			tile_i,
			tile_j,
			flags);
	}

	/**
	* Process items on a 3D grid.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range_i; i++)
	*     for (size_t j = 0; j < range_j; j++)
	*       for (size_t k = 0; k < range_k; k++)
	*         functor(i, j, k);
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param functor the functor to call for each item.
	* @param range_i the number of items to process along the first dimension
	* of the 3D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 3D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 3D grid.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	template<class T>
	inline void pthreadpool_parallelize_3d(
		pthreadpool_t threadpool,
		const T& functor,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		uint32_t flags = 0)
	{
		/*
		 * The functor's address is passed as the opaque context pointer. const
		 * is cast away only to match the void* parameter; the wrapper is
		 * instantiated for `const T`.
		 */
		pthreadpool_parallelize_3d(
			threadpool,
			&libpthreadpool::detail::call_wrapper_3d<const T>,
			const_cast<void*>(static_cast<const void*>(&functor)),
			range_i,
			range_j,
			range_k,
			flags);
	}

	/**
	* Process items on a 3D grid with the specified maximum tile size along the
	* last grid dimension.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range_i; i++)
	*     for (size_t j = 0; j < range_j; j++)
	*       for (size_t k = 0; k < range_k; k += tile_k)
	*         functor(i, j, k, min(range_k - k, tile_k));
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param functor the functor to call for each tile.
	* @param range_i the number of items to process along the first dimension
	* of the 3D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 3D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 3D grid.
	* @param tile_k the maximum number of items along the third dimension of
	* the 3D grid to process in one functor call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	template<class T>
	inline void pthreadpool_parallelize_3d_tile_1d(
		pthreadpool_t threadpool,
		const T& functor,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		size_t tile_k,
		uint32_t flags = 0)
	{
		/*
		 * The functor's address is passed as the opaque context pointer. const
		 * is cast away only to match the void* parameter; the wrapper is
		 * instantiated for `const T`.
		 */
		pthreadpool_parallelize_3d_tile_1d(
			threadpool,
			&libpthreadpool::detail::call_wrapper_3d_tile_1d<const T>,
			const_cast<void*>(static_cast<const void*>(&functor)),
			range_i,
			range_j,
			range_k,
			tile_k,
			flags);
	}

	/**
	* Process items on a 3D grid with the specified maximum tile size along the
	* last two grid dimensions.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range_i; i++)
	*     for (size_t j = 0; j < range_j; j += tile_j)
	*       for (size_t k = 0; k < range_k; k += tile_k)
	*         functor(i, j, k,
	*           min(range_j - j, tile_j), min(range_k - k, tile_k));
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param functor the functor to call for each tile.
	* @param range_i the number of items to process along the first dimension
	* of the 3D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 3D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 3D grid.
	* @param tile_j the maximum number of items along the second dimension of
	* the 3D grid to process in one functor call.
	* @param tile_k the maximum number of items along the third dimension of
	* the 3D grid to process in one functor call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	template<class T>
	inline void pthreadpool_parallelize_3d_tile_2d(
		pthreadpool_t threadpool,
		const T& functor,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		size_t tile_j,
		size_t tile_k,
		uint32_t flags = 0)
	{
		/*
		 * The functor's address is passed as the opaque context pointer. const
		 * is cast away only to match the void* parameter; the wrapper is
		 * instantiated for `const T`.
		 */
		pthreadpool_parallelize_3d_tile_2d(
			threadpool,
			&libpthreadpool::detail::call_wrapper_3d_tile_2d<const T>,
			const_cast<void*>(static_cast<const void*>(&functor)),
			range_i,
			range_j,
			range_k,
			tile_j,
			tile_k,
			flags);
	}

	/**
	* Process items on a 4D grid.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range_i; i++)
	*     for (size_t j = 0; j < range_j; j++)
	*       for (size_t k = 0; k < range_k; k++)
	*         for (size_t l = 0; l < range_l; l++)
	*           functor(i, j, k, l);
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param functor the functor to call for each item.
	* @param range_i the number of items to process along the first dimension
	* of the 4D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 4D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 4D grid.
	* @param range_l the number of items to process along the fourth dimension
	* of the 4D grid.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	template<class T>
	inline void pthreadpool_parallelize_4d(
		pthreadpool_t threadpool,
		const T& functor,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		size_t range_l,
		uint32_t flags = 0)
	{
		/*
		 * The functor's address is passed as the opaque context pointer. const
		 * is cast away only to match the void* parameter; the wrapper is
		 * instantiated for `const T`.
		 */
		pthreadpool_parallelize_4d(
			threadpool,
			&libpthreadpool::detail::call_wrapper_4d<const T>,
			const_cast<void*>(static_cast<const void*>(&functor)),
			range_i,
			range_j,
			range_k,
			range_l,
			flags);
	}

	/**
	* Process items on a 4D grid with the specified maximum tile size along the
	* last grid dimension.
	*
	* The function implements a parallel version of the following snippet:
	*
	* for (size_t i = 0; i < range_i; i++)
	* for (size_t j = 0; j < range_j; j++)
	* for (size_t k = 0; k < range_k; k++)
	* for (size_t l = 0; l < range_l; l += tile_l)
	* functor(i, j, k, l, min(range_l - l, tile_l));
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param functor the functor to call for each tile.
	* @param range_i the number of items to process along the first dimension
	* of the 4D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 4D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 4D grid.
	* @param range_l the number of items to process along the fourth dimension
	* of the 4D grid.
	* @param tile_l the maximum number of items along the fourth dimension of
	* the 4D grid to process in one functor call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	template<class T>
	inline void pthreadpool_parallelize_4d_tile_1d(
		pthreadpool_t threadpool,
		const T& functor,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		size_t range_l,
		size_t tile_l,
		uint32_t flags = 0)
	{
		/*
		 * Erase the functor's type: its address is handed over as a plain
		 * void* (constness dropped only for transport). The wrapper is
		 * instantiated with const T, so it is expected to treat the pointer
		 * as a const T when invoking the functor.
		 */
		pthreadpool_parallelize_4d_tile_1d(
			threadpool,
			&libpthreadpool::detail::call_wrapper_4d_tile_1d<const T>,
			const_cast<void*>(static_cast<const void*>(&functor)),
			range_i,
			range_j,
			range_k,
			range_l,
			tile_l,
			flags);
	}

	/**
	* Process items on a 4D grid with the specified maximum tile size along the
	* last two grid dimensions.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range_i; i++)
	*     for (size_t j = 0; j < range_j; j++)
	*       for (size_t k = 0; k < range_k; k += tile_k)
	*         for (size_t l = 0; l < range_l; l += tile_l)
	*           functor(i, j, k, l,
	*             min(range_k - k, tile_k), min(range_l - l, tile_l));
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param functor the functor to call for each tile.
	* @param range_i the number of items to process along the first dimension
	* of the 4D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 4D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 4D grid.
	* @param range_l the number of items to process along the fourth dimension
	* of the 4D grid.
	* @param tile_k the maximum number of items along the third dimension of
	* the 4D grid to process in one functor call.
	* @param tile_l the maximum number of items along the fourth dimension of
	* the 4D grid to process in one functor call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	template<class T>
	inline void pthreadpool_parallelize_4d_tile_2d(
		pthreadpool_t threadpool,
		const T& functor,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		size_t range_l,
		size_t tile_k,
		size_t tile_l,
		uint32_t flags = 0)
	{
		/*
		 * Erase the functor's type: its address is handed over as a plain
		 * void* (constness dropped only for transport). The wrapper is
		 * instantiated with const T, so it is expected to treat the pointer
		 * as a const T when invoking the functor.
		 */
		pthreadpool_parallelize_4d_tile_2d(
			threadpool,
			&libpthreadpool::detail::call_wrapper_4d_tile_2d<const T>,
			const_cast<void*>(static_cast<const void*>(&functor)),
			range_i,
			range_j,
			range_k,
			range_l,
			tile_k,
			tile_l,
			flags);
	}

	/**
	* Process items on a 5D grid.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range_i; i++)
	*     for (size_t j = 0; j < range_j; j++)
	*       for (size_t k = 0; k < range_k; k++)
	*         for (size_t l = 0; l < range_l; l++)
	*           for (size_t m = 0; m < range_m; m++)
	*             functor(i, j, k, l, m);
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param functor the functor to call for each item.
	* @param range_i the number of items to process along the first dimension
	* of the 5D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 5D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 5D grid.
	* @param range_l the number of items to process along the fourth dimension
	* of the 5D grid.
	* @param range_m the number of items to process along the fifth dimension
	* of the 5D grid.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	template<class T>
	inline void pthreadpool_parallelize_5d(
		pthreadpool_t threadpool,
		const T& functor,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		size_t range_l,
		size_t range_m,
		uint32_t flags = 0)
	{
		/*
		 * Erase the functor's type: its address is handed over as a plain
		 * void* (constness dropped only for transport). The wrapper is
		 * instantiated with const T, so it is expected to treat the pointer
		 * as a const T when invoking the functor.
		 */
		pthreadpool_parallelize_5d(
			threadpool,
			&libpthreadpool::detail::call_wrapper_5d<const T>,
			const_cast<void*>(static_cast<const void*>(&functor)),
			range_i,
			range_j,
			range_k,
			range_l,
			range_m,
			flags);
	}

	/**
	* Process items on a 5D grid with the specified maximum tile size along the
	* last grid dimension.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range_i; i++)
	*     for (size_t j = 0; j < range_j; j++)
	*       for (size_t k = 0; k < range_k; k++)
	*         for (size_t l = 0; l < range_l; l++)
	*           for (size_t m = 0; m < range_m; m += tile_m)
	*             functor(i, j, k, l, m, min(range_m - m, tile_m));
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param functor the functor to call for each tile.
	* @param range_i the number of items to process along the first dimension
	* of the 5D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 5D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 5D grid.
	* @param range_l the number of items to process along the fourth dimension
	* of the 5D grid.
	* @param range_m the number of items to process along the fifth dimension
	* of the 5D grid.
	* @param tile_m the maximum number of items along the fifth dimension of
	* the 5D grid to process in one functor call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	template<class T>
	inline void pthreadpool_parallelize_5d_tile_1d(
		pthreadpool_t threadpool,
		const T& functor,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		size_t range_l,
		size_t range_m,
		size_t tile_m,
		uint32_t flags = 0)
	{
		/*
		 * Erase the functor's type: its address is handed over as a plain
		 * void* (constness dropped only for transport). The wrapper is
		 * instantiated with const T, so it is expected to treat the pointer
		 * as a const T when invoking the functor.
		 */
		pthreadpool_parallelize_5d_tile_1d(
			threadpool,
			&libpthreadpool::detail::call_wrapper_5d_tile_1d<const T>,
			const_cast<void*>(static_cast<const void*>(&functor)),
			range_i,
			range_j,
			range_k,
			range_l,
			range_m,
			tile_m,
			flags);
	}

	/**
	* Process items on a 5D grid with the specified maximum tile size along the
	* last two grid dimensions.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range_i; i++)
	*     for (size_t j = 0; j < range_j; j++)
	*       for (size_t k = 0; k < range_k; k++)
	*         for (size_t l = 0; l < range_l; l += tile_l)
	*           for (size_t m = 0; m < range_m; m += tile_m)
	*             functor(i, j, k, l, m,
	*               min(range_l - l, tile_l), min(range_m - m, tile_m));
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param functor the functor to call for each tile.
	* @param range_i the number of items to process along the first dimension
	* of the 5D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 5D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 5D grid.
	* @param range_l the number of items to process along the fourth dimension
	* of the 5D grid.
	* @param range_m the number of items to process along the fifth dimension
	* of the 5D grid.
	* @param tile_l the maximum number of items along the fourth dimension of
	* the 5D grid to process in one functor call.
	* @param tile_m the maximum number of items along the fifth dimension of
	* the 5D grid to process in one functor call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	template<class T>
	inline void pthreadpool_parallelize_5d_tile_2d(
		pthreadpool_t threadpool,
		const T& functor,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		size_t range_l,
		size_t range_m,
		size_t tile_l,
		size_t tile_m,
		uint32_t flags = 0)
	{
		/*
		 * Erase the functor's type: its address is handed over as a plain
		 * void* (constness dropped only for transport). The wrapper is
		 * instantiated with const T, so it is expected to treat the pointer
		 * as a const T when invoking the functor.
		 */
		pthreadpool_parallelize_5d_tile_2d(
			threadpool,
			&libpthreadpool::detail::call_wrapper_5d_tile_2d<const T>,
			const_cast<void*>(static_cast<const void*>(&functor)),
			range_i,
			range_j,
			range_k,
			range_l,
			range_m,
			tile_l,
			tile_m,
			flags);
	}

	/**
	* Process items on a 6D grid.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range_i; i++)
	*     for (size_t j = 0; j < range_j; j++)
	*       for (size_t k = 0; k < range_k; k++)
	*         for (size_t l = 0; l < range_l; l++)
	*           for (size_t m = 0; m < range_m; m++)
	*             for (size_t n = 0; n < range_n; n++)
	*               functor(i, j, k, l, m, n);
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param functor the functor to call for each item.
	* @param range_i the number of items to process along the first dimension
	* of the 6D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 6D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 6D grid.
	* @param range_l the number of items to process along the fourth dimension
	* of the 6D grid.
	* @param range_m the number of items to process along the fifth dimension
	* of the 6D grid.
	* @param range_n the number of items to process along the sixth dimension
	* of the 6D grid.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	template<class T>
	inline void pthreadpool_parallelize_6d(
		pthreadpool_t threadpool,
		const T& functor,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		size_t range_l,
		size_t range_m,
		size_t range_n,
		uint32_t flags = 0)
	{
		/*
		 * Erase the functor's type: its address is handed over as a plain
		 * void* (constness dropped only for transport). The wrapper is
		 * instantiated with const T, so it is expected to treat the pointer
		 * as a const T when invoking the functor.
		 */
		pthreadpool_parallelize_6d(
			threadpool,
			&libpthreadpool::detail::call_wrapper_6d<const T>,
			const_cast<void*>(static_cast<const void*>(&functor)),
			range_i,
			range_j,
			range_k,
			range_l,
			range_m,
			range_n,
			flags);
	}

	/**
	* Process items on a 6D grid with the specified maximum tile size along the
	* last grid dimension.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range_i; i++)
	*     for (size_t j = 0; j < range_j; j++)
	*       for (size_t k = 0; k < range_k; k++)
	*         for (size_t l = 0; l < range_l; l++)
	*           for (size_t m = 0; m < range_m; m++)
	*             for (size_t n = 0; n < range_n; n += tile_n)
	*               functor(i, j, k, l, m, n, min(range_n - n, tile_n));
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param functor the functor to call for each tile.
	* @param range_i the number of items to process along the first dimension
	* of the 6D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 6D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 6D grid.
	* @param range_l the number of items to process along the fourth dimension
	* of the 6D grid.
	* @param range_m the number of items to process along the fifth dimension
	* of the 6D grid.
	* @param range_n the number of items to process along the sixth dimension
	* of the 6D grid.
	* @param tile_n the maximum number of items along the sixth dimension of
	* the 6D grid to process in one functor call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	template<class T>
	inline void pthreadpool_parallelize_6d_tile_1d(
		pthreadpool_t threadpool,
		const T& functor,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		size_t range_l,
		size_t range_m,
		size_t range_n,
		size_t tile_n,
		uint32_t flags = 0)
	{
		/*
		 * Erase the functor's type: its address is handed over as a plain
		 * void* (constness dropped only for transport). The wrapper is
		 * instantiated with const T, so it is expected to treat the pointer
		 * as a const T when invoking the functor.
		 */
		pthreadpool_parallelize_6d_tile_1d(
			threadpool,
			&libpthreadpool::detail::call_wrapper_6d_tile_1d<const T>,
			const_cast<void*>(static_cast<const void*>(&functor)),
			range_i,
			range_j,
			range_k,
			range_l,
			range_m,
			range_n,
			tile_n,
			flags);
	}

	/**
	* Process items on a 6D grid with the specified maximum tile size along the
	* last two grid dimensions.
	*
	* The function implements a parallel version of the following snippet:
	*
	*   for (size_t i = 0; i < range_i; i++)
	*     for (size_t j = 0; j < range_j; j++)
	*       for (size_t k = 0; k < range_k; k++)
	*         for (size_t l = 0; l < range_l; l++)
	*           for (size_t m = 0; m < range_m; m += tile_m)
	*             for (size_t n = 0; n < range_n; n += tile_n)
	*               functor(i, j, k, l, m, n,
	*                 min(range_m - m, tile_m), min(range_n - n, tile_n));
	*
	* When the function returns, all items have been processed and the thread pool
	* is ready for a new task.
	*
	* @note If multiple threads call this function with the same thread pool, the
	* calls are serialized.
	*
	* @param threadpool the thread pool to use for parallelisation. If threadpool
	* is NULL, all items are processed serially on the calling thread.
	* @param functor the functor to call for each tile.
	* @param range_i the number of items to process along the first dimension
	* of the 6D grid.
	* @param range_j the number of items to process along the second dimension
	* of the 6D grid.
	* @param range_k the number of items to process along the third dimension
	* of the 6D grid.
	* @param range_l the number of items to process along the fourth dimension
	* of the 6D grid.
	* @param range_m the number of items to process along the fifth dimension
	* of the 6D grid.
	* @param range_n the number of items to process along the sixth dimension
	* of the 6D grid.
	* @param tile_m the maximum number of items along the fifth dimension of
	* the 6D grid to process in one functor call.
	* @param tile_n the maximum number of items along the sixth dimension of
	* the 6D grid to process in one functor call.
	* @param flags a bitwise combination of zero or more optional flags
	* (PTHREADPOOL_FLAG_DISABLE_DENORMALS or PTHREADPOOL_FLAG_YIELD_WORKERS)
	*/
	template<class T>
	inline void pthreadpool_parallelize_6d_tile_2d(
		pthreadpool_t threadpool,
		const T& functor,
		size_t range_i,
		size_t range_j,
		size_t range_k,
		size_t range_l,
		size_t range_m,
		size_t range_n,
		size_t tile_m,
		size_t tile_n,
		uint32_t flags = 0)
	{
		/*
		 * Erase the functor's type: its address is handed over as a plain
		 * void* (constness dropped only for transport). The wrapper is
		 * instantiated with const T, so it is expected to treat the pointer
		 * as a const T when invoking the functor.
		 */
		pthreadpool_parallelize_6d_tile_2d(
			threadpool,
			&libpthreadpool::detail::call_wrapper_6d_tile_2d<const T>,
			const_cast<void*>(static_cast<const void*>(&functor)),
			range_i,
			range_j,
			range_k,
			range_l,
			range_m,
			range_n,
			tile_m,
			tile_n,
			flags);
	}

	#endif /* __cplusplus */

	#endif /* PTHREADPOOL_H_ */