File size: 3,971 Bytes
28958dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#pragma once

#include "diffvg.h"
#include "vector.h"
#include "matrix.h"

// https://stackoverflow.com/questions/39274472/error-function-atomicadddouble-double-has-already-been-defined
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
// Software replacement for atomicAdd(double*, double), compiled only for
// device passes where __CUDA_ARCH__ is below 600. The addition is emulated
// with a compare-and-swap loop over the 64-bit pattern of the double.
//
//   address  location to accumulate into
//   val      amount to add
//   returns  the value observed at *address just before this call's addition
//            (or the value read on entry when val is zero, in which case
//            nothing is written)
static inline DEVICE double atomicAdd(double *address, double val) {
    unsigned long long int *bits = reinterpret_cast<unsigned long long int *>(address);
    unsigned long long int observed = *bits;
    if (val != 0.0) {
        unsigned long long int expected;
        do {
            expected = observed;
            const double sum = val + __longlong_as_double(expected);
            observed = atomicCAS(bits, expected, __double_as_longlong(sum));
            // Compare the raw bit patterns rather than the doubles, so a NaN
            // stored at the address cannot keep the loop spinning forever.
        } while (expected != observed);
    }
    return __longlong_as_double(observed);
}
#endif

#ifndef WIN32
    // Atomically adds `source` to `target`.
    //
    //   target   location to accumulate into
    //   source   amount to add; it is cast to T0 before the add on the device
    //            path, and added as-is (result stored as T0) on the host path
    //   returns  the value `target` held immediately before this addition
    //
    // Device passes (__CUDA_ARCH__ defined) forward to CUDA's atomicAdd.
    // Host passes use the GCC/Clang generic __atomic builtins, which take the
    // __ATOMIC_* integer memory-order constants. Those constants are used
    // directly so the code does not depend on <atomic> or on how
    // std::memory_order is spelled in the active C++ standard.
    template <typename T0, typename T1>
    DEVICE
    inline T0 atomic_add_(T0 &target, T1 source) {
    #ifdef __CUDA_ARCH__
        return atomicAdd(&target, (T0)source);
    #else
        T0 old_val;
        T0 new_val;
        // Take the initial snapshot with an atomic load; a plain read would
        // race with concurrent writers.
        __atomic_load(&target, &old_val, __ATOMIC_SEQ_CST);
        do {
            new_val = old_val + source;
            // On failure the builtin stores the current contents of `target`
            // into old_val, so the next iteration recomputes from fresh data.
        } while (!__atomic_compare_exchange(&target, &old_val, &new_val, true,
                                            __ATOMIC_SEQ_CST,
                                            __ATOMIC_SEQ_CST));
        return old_val;
    #endif
    }

    // Single-precision atomic add: forwards to atomic_add_ and hands back
    // whatever that helper returns.
    DEVICE
    inline
    float atomic_add(float &target, float source) {
        const float previous = atomic_add_(target, source);
        return previous;
    }
    // Double-precision atomic add: forwards to atomic_add_ and hands back
    // whatever that helper returns.
    DEVICE
    inline
    double atomic_add(double &target, double source) {
        const double previous = atomic_add_(target, source);
        return previous;
    }
#else
	// Host-side atomic add helpers used by the WIN32 build. Only the
	// declarations live in this header; the definitions are not visible here
	// (presumably in a separate translation unit -- confirm). The return value
	// is forwarded unchanged by the atomic_add overloads below.
	// NOTE(review): expected to return the pre-add value like atomicAdd -- verify.
	float win_atomic_add(float &target, float source);
	double win_atomic_add(double &target, double source);
    // Single-precision atomic add for the WIN32 build. Host passes delegate
    // to win_atomic_add; device passes (__CUDA_ARCH__ defined) call CUDA's
    // atomicAdd. The callee's result is returned unchanged.
    DEVICE
    static float atomic_add(float &target, float source) {
    #ifndef __CUDA_ARCH__
        return win_atomic_add(target, source);
    #else
        return atomicAdd(&target, source);
    #endif
    }
    // Double-precision atomic add for the WIN32 build. Host passes delegate
    // to win_atomic_add; device passes (__CUDA_ARCH__ defined) call
    // atomicAdd on the double. The callee's result is returned unchanged.
    DEVICE
    static double atomic_add(double &target, double source) {
    #ifndef __CUDA_ARCH__
        return win_atomic_add(target, source);
    #else
        return atomicAdd(&target, source);
    #endif
    }
#endif

// Pointer form of atomic_add: dereferences `target`, converts `source` to the
// pointee type T0, and forwards to the reference overload, returning its result.
template <typename T0, typename T1>
DEVICE
inline T0 atomic_add(T0 *target, T1 source) {
    T0 &slot = *target;
    return atomic_add(slot, (T0)source);
}

// Adds the two components of `source` onto the matching components of
// `target`, one atomic_add per component, in index order 0 then 1.
//
// Returns a by-value copy of `target` taken after both additions. The copy is
// an ordinary read and the two adds are separate operations, so the vector as
// a whole is not updated in one atomic step.
template <typename T0, typename T1>
DEVICE
inline TVector2<T0> atomic_add(TVector2<T0> &target, const TVector2<T1> &source) {
    for (int axis = 0; axis < 2; ++axis) {
        atomic_add(target[axis], source[axis]);
    }
    return target;
}

// Adds the two components of `source` to target[0] and target[1], one
// atomic_add per element, each component converted to T0 first.
// `target` must therefore point at no fewer than two T0 elements.
template <typename T0, typename T1>
DEVICE
inline void atomic_add(T0 *target, const TVector2<T1> &source) {
    for (int axis = 0; axis < 2; ++axis) {
        atomic_add(target[axis], (T0)source[axis]);
    }
}

// Adds the three components of `source` onto the matching components of
// `target`, one atomic_add per component, in index order 0, 1, 2.
//
// Returns a by-value copy of `target` taken after the additions. The copy is
// an ordinary read and the adds are separate operations, so the vector as a
// whole is not updated in one atomic step.
template <typename T0, typename T1>
DEVICE
inline TVector3<T0> atomic_add(TVector3<T0> &target, const TVector3<T1> &source) {
    for (int axis = 0; axis < 3; ++axis) {
        atomic_add(target[axis], source[axis]);
    }
    return target;
}

// Adds the three components of `source` to target[0..2], one atomic_add per
// element, each component converted to T0 first.
// `target` must therefore point at no fewer than three T0 elements.
template <typename T0, typename T1>
DEVICE
inline void atomic_add(T0 *target, const TVector3<T1> &source) {
    for (int axis = 0; axis < 3; ++axis) {
        atomic_add(target[axis], (T0)source[axis]);
    }
}

// Adds the four components of `source` onto the matching components of
// `target`, one atomic_add per component, in index order 0 through 3.
//
// Returns a by-value copy of `target` taken after the additions. The copy is
// an ordinary read and the adds are separate operations, so the vector as a
// whole is not updated in one atomic step.
template <typename T0, typename T1>
DEVICE
inline TVector4<T0> atomic_add(TVector4<T0> &target, const TVector4<T1> &source) {
    for (int axis = 0; axis < 4; ++axis) {
        atomic_add(target[axis], source[axis]);
    }
    return target;
}

// Adds the four components of `source` to target[0..3], one atomic_add per
// element, each component converted to T0 first.
// `target` must therefore point at no fewer than four T0 elements.
template <typename T0, typename T1>
DEVICE
inline void atomic_add(T0 *target, const TVector4<T1> &source) {
    for (int axis = 0; axis < 4; ++axis) {
        atomic_add(target[axis], (T0)source[axis]);
    }
}

// Adds every entry of a 3x3 matrix into a flat array of nine T0 values.
// Entry (row, col) of `source` is converted to T0 and atomically added to
// target[3 * row + col]; entries are visited row by row.
// `target` must therefore point at no fewer than nine T0 elements.
template <typename T0, typename T1>
DEVICE
inline void atomic_add(T0 *target, const TMatrix3x3<T1> &source) {
    for (int flat = 0; flat < 9; ++flat) {
        const int row = flat / 3;
        const int col = flat % 3;
        atomic_add(target[flat], (T0)source(row, col));
    }
}