File size: 3,857 Bytes
f5bb0c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#ifndef OPENPOSE_PRIVATE_UTILITIES_AVX_HPP
#define OPENPOSE_PRIVATE_UTILITIES_AVX_HPP

// Warning:
// This file contains auxiliary functions for AVX.
// This file should only be included from cpp files.
// Default #include <openpose/headers.hpp> does not include it.

#ifdef WITH_AVX
    #include <cstdint> // uintptr_t
    #include <cstdlib> // malloc, free
    #include <memory> // shared_ptr
    #include <immintrin.h>
    #include <openpose/utilities/errorAndLog.hpp>

    namespace op
    {
        // ALIGN32(x): emits the declaration `x` with a 32-byte alignment request,
        // using the syntax of the compiler in use.
        // Usage: ALIGN32(float buffer[8]);
        #ifdef __GNUC__
            // GCC-style attribute goes after the declarator
            #define ALIGN32(x) x __attribute__((aligned(32)))
        #elif defined(_MSC_VER) // defined(_WIN32)
            // __declspec(align(#)) goes before the declaration it applies to.
            // The declaration `x` must be part of the expansion, otherwise
            // nothing is declared at all.
            #define ALIGN32(x) __declspec(align(32)) x
        #else
            #error Unknown environment!
        #endif

        // Functions
        // Sources:
        // - https://stackoverflow.com/questions/32612190/how-to-solve-the-32-byte-alignment-issue-for-avx-load-store-operations
        // - https://embeddedartistry.com/blog/2017/2/20/implementing-aligned-malloc
        // - https://embeddedartistry.com/blog/2017/2/23/c-smart-pointers-with-aligned-mallocfree
        // Integer type of the bookkeeping value that aligned_malloc writes
        // immediately before every pointer it returns.
        using offset_t = unsigned long long;
        // Number of bytes occupied by that bookkeeping value.
        #define PTR_OFFSET_SZ sizeof(offset_t)
        // Rounds `num` up to the next multiple of `align`.
        // The mask arithmetic is only valid when `align` is a power of two.
        #if !defined(align_up)
            #define align_up(num, align) (((num) + ((align) - 1)) & ~((align) - 1))
        #endif
        inline void * aligned_malloc(const size_t align, const size_t size)
        {
            void * ptr = nullptr;

            // 2 conditions:
            //  - We want both align and size to be greater than 0
            //  - We want it to be a power of two since align_up operates on powers of two
            if (align && size && (align & (align - 1)) == 0)
            {
                // We know we have to fit an offset value
                // We also allocate extra bytes to ensure we can meet the alignment
                const auto hdr_size = PTR_OFFSET_SZ + (align - 1);
                void * p = malloc(size + hdr_size);

                if (p)
                {
                    // Add the offset size to malloc's pointer (we will always store that)
                    // Then align the resulting value to the arget alignment
                    ptr = (void *) align_up(((uintptr_t)p + PTR_OFFSET_SZ), align);

                    // Calculate the offset and store it behind our aligned pointer
                    *((offset_t *)ptr - 1) = (offset_t)((uintptr_t)ptr - (uintptr_t)p);

                } // else nullptr, could not malloc
            } // else nullptr, invalid arguments

            if (ptr == nullptr)
            {
                error("Shared pointer could not be allocated for Array data storage.",
                      __LINE__, __FUNCTION__, __FILE__);
            }

            return ptr;
        }
        inline void aligned_free(void * ptr)
        {
            if (ptr == nullptr)
                error("Received nullptr.", __LINE__, __FUNCTION__, __FILE__);

            // Walk backwards from the passed-in pointer to get the pointer offset
            // We convert to an offset_t pointer and rely on pointer math to get the data
            offset_t offset = *((offset_t *)ptr - 1);

            // Once we have the offset, we can get our original pointer and call free
            void * p = (void *)((uint8_t *)ptr - offset);
            free(p);
        }
        /**
         * Creates a shared_ptr owning an aligned buffer with room for `size` elements of T.
         * The buffer comes from aligned_malloc with an alignment of 8*sizeof(T) bytes and is released
         * through aligned_free. The memory is not initialized and no T constructors are run here.
         * NOTE(review): 8*sizeof(T) is 32 bytes for a 4-byte T, presumably chosen for 256-bit AVX
         * loads/stores. aligned_malloc rejects alignments that are not a power of two, so a T whose
         * size is not a power of two cannot be allocated through this function.
         * @param size Number of T elements to allocate. Must be greater than 0.
         * @return The owning shared_ptr. Failures are reported through error(); an empty shared_ptr is
         * returned in case error() returns.
         */
        template<class T>
        std::shared_ptr<T> aligned_shared_ptr(const size_t size)
        {
            try
            {
                // sizeof(T)*size must be representable, otherwise the multiplication wraps around
                // and a buffer smaller than requested would be handed out.
                if (size > SIZE_MAX / sizeof(T))
                {
                    error("Requested number of elements overflows the maximum allocation size.",
                          __LINE__, __FUNCTION__, __FILE__);
                    return std::shared_ptr<T>{};
                }

                T * const data = static_cast<T*>(
                    aligned_malloc(8*sizeof(T), sizeof(T)*size));
                // A shared_ptr built from a null pointer plus a deleter still invokes that deleter,
                // so do not hand a failed allocation to aligned_free.
                if (data == nullptr)
                    return std::shared_ptr<T>{};

                return std::shared_ptr<T>(data, &aligned_free);
            }
            catch (const std::exception& e)
            {
                error(e.what(), __LINE__, __FUNCTION__, __FILE__);
                return std::shared_ptr<T>{};
            }
        }
    }
#endif

#endif // OPENPOSE_PRIVATE_UTILITIES_AVX_HPP