| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | #ifndef KERNEL_MAP_CUH |
| | #define KERNEL_MAP_CUH |
| |
|
| | #include "3rdparty/hash/hash_allocator.cuh" |
| |
|
| | #include "coordinate_map_functors.cuh" |
| | #include "storage.cuh" |
| | #include "types.hpp" |
| |
|
| | #include <functional> |
| | #include <map> |
| | #include <memory> |
| |
|
| | #include <thrust/copy.h> |
| | #include <thrust/execution_policy.h> |
| | #include <thrust/iterator/constant_iterator.h> |
| | #include <thrust/iterator/counting_iterator.h> |
| | #include <thrust/sort.h> |
| |
|
| | namespace minkowski { |
| |
|
| | template <typename index_type, typename ByteAllocator> class gpu_kernel_map { |
| | public: |
| | using size_type = default_types::size_type; |
| | using byte_allocator_type = ByteAllocator; |
| | using self_type = gpu_kernel_map<index_type, ByteAllocator>; |
| |
|
| | class contiguous_memory { |
| | public: |
| | using self_type = contiguous_memory; |
| |
|
| | public: |
| | contiguous_memory() = delete; |
| | contiguous_memory(gpu_kernel_map &kernel_map) : m_kernel_map{kernel_map} {} |
| |
|
| | self_type &operator=(self_type const &other) { |
| | m_data = other.m_data; |
| | m_kernel_map = other.m_kernel_map; |
| | return *this; |
| | } |
| |
|
| | inline typename std::map<index_type, index_type>::const_iterator |
| | key_cbegin() const { |
| | return m_kernel_map.m_kernel_offset_map.cbegin(); |
| | } |
| | inline typename std::map<index_type, index_type>::const_iterator |
| | key_cend() const { |
| | return m_kernel_map.m_kernel_offset_map.cend(); |
| | } |
| |
|
| | inline index_type *data() { return m_data; } |
| | inline index_type const *cdata() const { return m_data; } |
| | inline void data(index_type *p_data) { m_data = p_data; } |
| |
|
| | inline index_type *begin() const { return m_data; } |
| | inline index_type *begin(index_type kernel_index) const { |
| | auto const &offset_map = m_kernel_map.m_kernel_offset_map; |
| | auto const iter = offset_map.find(kernel_index); |
| | if (iter == offset_map.end()) { |
| | |
| | return m_data; |
| | } else { |
| | return m_data + iter->second; |
| | } |
| | } |
| |
|
| | inline index_type *end() const { return m_data + m_kernel_map.m_capacity; } |
| | inline index_type *end(index_type kernel_index) const { |
| | auto const &offset_map = m_kernel_map.m_kernel_offset_map; |
| | auto const iter = offset_map.find(kernel_index); |
| | if (iter == offset_map.end()) { |
| | |
| | return m_data + m_kernel_map.m_capacity; |
| | } else { |
| | return m_data + iter->second + |
| | m_kernel_map.m_kernel_size_map[kernel_index]; |
| | } |
| | } |
| |
|
| | size_type size(index_type kernel_index) const { |
| | auto const &size_map = m_kernel_map.m_kernel_size_map; |
| | auto const iter = size_map.find(kernel_index); |
| | if (iter == size_map.end()) |
| | return 0; |
| | return iter->second; |
| | } |
| |
|
| | private: |
| | index_type *m_data; |
| | gpu_kernel_map &m_kernel_map; |
| | }; |
| |
|
| | public: |
| | gpu_kernel_map() |
| | : m_memory_size_byte{0}, |
| | m_capacity{0}, kernels{*this}, in_maps{*this}, out_maps{*this} { |
| | LOG_DEBUG("Initialized gpu_kernel_map"); |
| | } |
| | gpu_kernel_map(self_type const &other) |
| | : m_decomposed(other.m_decomposed), |
| | m_requires_kernel_index(other.m_requires_kernel_index), |
| | m_memory_size_byte(other.m_memory_size_byte), |
| | m_capacity{other.m_capacity}, |
| | m_in_map_memory{other.m_in_map_memory}, |
| | m_out_map_memory{other.m_out_map_memory}, |
| | m_allocator{other.m_allocator}, |
| | m_kernel_size_map{other.m_kernel_size_map}, |
| | m_kernel_offset_map{other.m_kernel_offset_map}, |
| | kernels{*this}, |
| | in_maps{*this}, |
| | out_maps{*this} { |
| | LOG_DEBUG("gpu_kernel_map copy constructor"); |
| | in_maps.data(other.in_maps.begin()); |
| | out_maps.data(other.out_maps.begin()); |
| | if (m_requires_kernel_index) { |
| | m_kernel_index_memory = other.m_kernel_index_memory; |
| | kernels.data(other.kernels.begin()); |
| | } |
| | } |
| |
|
| | gpu_kernel_map(size_type capacity, |
| | byte_allocator_type alloc = byte_allocator_type(), |
| | bool requires_kernel_index = true) |
| | : m_requires_kernel_index(requires_kernel_index), m_capacity{capacity}, |
| | m_allocator{alloc}, kernels{*this}, in_maps{*this}, out_maps{*this} { |
| | |
| | m_memory_size_byte = capacity * sizeof(index_type); |
| | index_type *ptr_in_map = reinterpret_cast<index_type *>( |
| | m_allocator.allocate(m_memory_size_byte)); |
| | index_type *ptr_out_map = reinterpret_cast<index_type *>( |
| | m_allocator.allocate(m_memory_size_byte)); |
| | index_type *ptr_kernel = nullptr; |
| |
|
| | auto deleter = [](index_type *p, byte_allocator_type alloc, |
| | size_type size) { |
| | alloc.deallocate(reinterpret_cast<char *>(p), size); |
| | LOG_DEBUG("Deallocate kernel map"); |
| | }; |
| |
|
| | m_in_map_memory = std::shared_ptr<index_type[]>{ |
| | ptr_in_map, std::bind(deleter, std::placeholders::_1, m_allocator, |
| | m_memory_size_byte)}; |
| | m_out_map_memory = std::shared_ptr<index_type[]>{ |
| | ptr_out_map, std::bind(deleter, std::placeholders::_1, m_allocator, |
| | m_memory_size_byte)}; |
| | |
| | in_maps.data(m_in_map_memory.get()); |
| | out_maps.data(m_out_map_memory.get()); |
| |
|
| | if (requires_kernel_index) { |
| | ptr_kernel = reinterpret_cast<index_type *>( |
| | m_allocator.allocate(m_memory_size_byte)); |
| | m_kernel_index_memory = std::shared_ptr<index_type[]>{ |
| | ptr_kernel, std::bind(deleter, std::placeholders::_1, m_allocator, |
| | m_memory_size_byte)}; |
| | kernels.data(m_kernel_index_memory.get()); |
| | } else { |
| | m_kernel_offset_map[0] = 0; |
| | m_kernel_size_map[0] = capacity; |
| | |
| | m_decomposed = true; |
| | } |
| | } |
| |
|
| | self_type swap() const { |
| | self_type swapped_gpu_kernel_map(*this); |
| | swapped_gpu_kernel_map.in_maps.data( |
| | swapped_gpu_kernel_map.m_out_map_memory.get()); |
| | swapped_gpu_kernel_map.out_maps.data( |
| | swapped_gpu_kernel_map.m_in_map_memory.get()); |
| |
|
| | #ifdef DEBUG |
| | size_type map_size = std::min<size_type>(in_maps.size(0), 100); |
| |
|
| | index_type *p_kernel_map = |
| | (index_type *)std::malloc(map_size * 3 * sizeof(index_type)); |
| | |
| | |
| | |
| | CUDA_CHECK(cudaMemcpy(p_kernel_map + 1 * map_size, in_maps.begin(), |
| | map_size * sizeof(index_type), |
| | cudaMemcpyDeviceToHost)); |
| | CUDA_CHECK(cudaMemcpy(p_kernel_map + 2 * map_size, out_maps.begin(), |
| | map_size * sizeof(index_type), |
| | cudaMemcpyDeviceToHost)); |
| |
|
| | for (index_type i = 0; i < map_size; ++i) { |
| | std::cout |
| | << p_kernel_map[i + 1 * map_size] << "->" |
| | << p_kernel_map[i + 2 * map_size] << "\n"; |
| | } |
| |
|
| | std::cout << "Swapped kernel map\n"; |
| |
|
| | |
| | |
| | |
| | |
| | CUDA_CHECK(cudaMemcpy( |
| | p_kernel_map + 1 * map_size, swapped_gpu_kernel_map.in_maps.begin(), |
| | map_size * sizeof(index_type), cudaMemcpyDeviceToHost)); |
| | CUDA_CHECK(cudaMemcpy( |
| | p_kernel_map + 2 * map_size, swapped_gpu_kernel_map.out_maps.begin(), |
| | map_size * sizeof(index_type), cudaMemcpyDeviceToHost)); |
| |
|
| | for (index_type i = 0; i < map_size; ++i) { |
| | std::cout |
| | << p_kernel_map[i + 1 * map_size] << "->" |
| | << p_kernel_map[i + 2 * map_size] << "\n"; |
| | } |
| | CUDA_CHECK(cudaDeviceSynchronize()); |
| | std::free(p_kernel_map); |
| | #endif |
| | return swapped_gpu_kernel_map; |
| | } |
| |
|
| | self_type &operator=(self_type const &other) { |
| | m_decomposed = other.m_decomposed; |
| | m_requires_kernel_index = other.m_requires_kernel_index; |
| |
|
| | m_memory_size_byte = other.m_memory_size_byte; |
| | m_capacity = other.m_capacity; |
| |
|
| | m_kernel_index_memory = other.m_kernel_index_memory; |
| | m_in_map_memory = other.m_in_map_memory; |
| | m_out_map_memory = other.m_out_map_memory; |
| | m_allocator = other.m_allocator; |
| |
|
| | m_kernel_size_map = other.m_kernel_size_map; |
| | m_kernel_offset_map = other.m_kernel_offset_map; |
| |
|
| | in_maps.data(other.in_maps.begin()); |
| | out_maps.data(other.out_maps.begin()); |
| | kernels.data(other.kernels.begin()); |
| |
|
| | return *this; |
| | } |
| |
|
| | |
| | inline typename std::map<index_type, index_type>::const_iterator |
| | key_cbegin() const { |
| | return m_kernel_offset_map.cbegin(); |
| | } |
| |
|
| | inline typename std::map<index_type, index_type>::const_iterator |
| | key_cend() const { |
| | return m_kernel_offset_map.cend(); |
| | } |
| |
|
| | size_type size() const { |
| | size_type nmap = 0; |
| | for (auto const &k : m_kernel_size_map) { |
| | nmap += k.second; |
| | } |
| | return nmap; |
| | } |
| |
|
| | size_type volume() const { return m_kernel_size_map.size(); } |
| |
|
| | size_type max_size() const { |
| | size_type nmap = 0; |
| | for (auto const &k : m_kernel_size_map) { |
| | if (k.second > nmap) |
| | nmap = k.second; |
| | } |
| | return nmap; |
| | } |
| |
|
| | size_type size(index_type const kernel_index) const { |
| | auto const iter = m_kernel_size_map.find(kernel_index); |
| | if (iter == m_kernel_size_map.end()) |
| | return 0; |
| | return iter->second; |
| | } |
| |
|
| | std::string to_string() const { |
| | Formatter o; |
| | size_type map_size = 0; |
| | for (auto const &kv : m_kernel_size_map) { |
| | map_size += kv.second; |
| | } |
| | o << "gpu_kernel_map: number of unique maps:" << m_kernel_size_map.size() |
| | << ", kernel map size:" << map_size; |
| | return o.str(); |
| | } |
| |
|
| | void decompose() { |
| | LOG_DEBUG("Decomposing", kernels.end() - kernels.begin(), "elements"); |
| | |
| | |
| | THRUST_CHECK(thrust::sort_by_key(thrust::device, |
| | kernels.begin(), |
| | kernels.end(), |
| | thrust::make_zip_iterator( |
| | thrust::make_tuple( |
| | in_maps.begin(), |
| | out_maps.begin() |
| | ) |
| | ))); |
| |
|
| | #ifdef DEBUG |
| | size_type map_size = |
| | std::min<size_type>(in_maps.end() - in_maps.begin(), 100); |
| | LOG_DEBUG("printing", map_size, "kernel maps"); |
| | index_type *p_kernel_map = |
| | (index_type *)std::malloc(map_size * 3 * sizeof(index_type)); |
| | CUDA_CHECK(cudaMemcpy(p_kernel_map, m_kernel_index_memory.get(), |
| | map_size * sizeof(index_type), |
| | cudaMemcpyDeviceToHost)); |
| | CUDA_CHECK(cudaMemcpy(p_kernel_map + map_size, m_in_map_memory.get(), |
| | map_size * sizeof(index_type), |
| | cudaMemcpyDeviceToHost)); |
| | CUDA_CHECK(cudaMemcpy(p_kernel_map + 2 * map_size, m_out_map_memory.get(), |
| | map_size * sizeof(index_type), |
| | cudaMemcpyDeviceToHost)); |
| |
|
| | for (index_type i = 0; i < map_size; ++i) { |
| | std::cout << p_kernel_map[i + 0 * map_size] << ":" |
| | << p_kernel_map[i + 1 * map_size] << "->" |
| | << p_kernel_map[i + 2 * map_size] << "\n"; |
| | } |
| | std::free(p_kernel_map); |
| | #endif |
| |
|
| | |
| | |
| | thrust::counting_iterator<index_type> min_begin{0}; |
| | thrust::constant_iterator<index_type> size_begin{1}; |
| |
|
| | gpu_storage<index_type, byte_allocator_type> out_key(m_capacity); |
| | gpu_storage<index_type, byte_allocator_type> out_key_min(m_capacity); |
| | gpu_storage<index_type, byte_allocator_type> out_key_size(m_capacity); |
| |
|
| | size_type num_unique_keys; |
| |
|
| | try { |
| | auto end = thrust::reduce_by_key( |
| | thrust::device, |
| | kernels.begin(), |
| | kernels.end(), |
| | thrust::make_zip_iterator( |
| | thrust::make_tuple(min_begin, size_begin)), |
| | out_key.begin(), |
| | thrust::make_zip_iterator(thrust::make_tuple( |
| | out_key_min.begin(), out_key_size.begin())), |
| | thrust::equal_to<index_type>(), |
| | detail::min_size_functor<index_type>() |
| | ); |
| | num_unique_keys = end.first - out_key.begin(); |
| | LOG_DEBUG(num_unique_keys, "unique kernel map keys found"); |
| | } |
| | THRUST_CATCH; |
| |
|
| | auto const cpu_out_keys = out_key.to_vector(num_unique_keys); |
| | auto const cpu_out_offset = out_key_min.to_vector(num_unique_keys); |
| | auto const cpu_out_size = out_key_size.to_vector(num_unique_keys); |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | #ifdef DEBUG |
| | LOG_DEBUG("Printing cpu keys"); |
| | LOG_DEBUG("Keys:", cpu_out_keys); |
| | LOG_DEBUG("Mins:", cpu_out_offset); |
| | LOG_DEBUG("Size:", cpu_out_size); |
| | #endif |
| |
|
| | |
| | for (index_type i = 0; i < num_unique_keys; ++i) { |
| | m_kernel_offset_map[cpu_out_keys[i]] = cpu_out_offset[i]; |
| | m_kernel_size_map[cpu_out_keys[i]] = cpu_out_size[i]; |
| | } |
| |
|
| | |
| | m_decomposed = true; |
| | } |
| |
|
| | friend std::ostream &operator<<(std::ostream &out, |
| | self_type const &kernel_map) { |
| | out << kernel_map.to_string(); |
| | return out; |
| | } |
| |
|
| | private: |
| | bool m_decomposed{false}; |
| | bool m_requires_kernel_index; |
| | size_type m_memory_size_byte, m_capacity; |
| | std::shared_ptr<index_type[]> m_kernel_index_memory; |
| | std::shared_ptr<index_type[]> m_in_map_memory; |
| | std::shared_ptr<index_type[]> m_out_map_memory; |
| | byte_allocator_type m_allocator; |
| |
|
| | std::map<index_type, index_type> m_kernel_size_map; |
| | std::map<index_type, index_type> m_kernel_offset_map; |
| |
|
| | public: |
| | contiguous_memory kernels; |
| | contiguous_memory in_maps; |
| | contiguous_memory out_maps; |
| | }; |
| |
|
| | } |
| |
|
| | #endif |
| |
|