#ifndef _XMP_GPU_RUNTIME_FUNC_DECL
#define _XMP_GPU_RUNTIME_FUNC_DECL
#define _XMP_GPU_M_GTOL(_desc, _dim) \
(((_XMP_gpu_array_t *)(_desc))[_dim].gtol)
#define _XMP_GPU_M_ACC(_desc, _dim) \
(((_XMP_gpu_array_t *)(_desc))[_dim].acc)
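/* Each descriptor is read as an array of per-dimension _XMP_gpu_array_t
   records; gtol appears to hold the dimension's global-to-local index offset
   and acc its accumulated element stride (inferred from the field names). */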
#define _XMP_M_CEILi(a_, b_) (((a_) % (b_)) == 0 ? ((a_) / (b_)) : ((a_) / (b_)) + 1)
#define _XMP_M_FLOORi(a_, b_) ((a_) / (b_))
#define _XMP_M_COUNT_TRIPLETi(l_, u_, s_) (_XMP_M_FLOORi(((u_) - (l_)), s_) + 1)
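/* Worked example: the triplet (lower = 0, upper = 8, stride = 2) describes
   the iterations 0, 2, 4, 6, 8, and _XMP_M_COUNT_TRIPLETi(0, 8, 2) evaluates
   to _XMP_M_FLOORi(8, 2) + 1 == 5. The upper bound is inclusive here;
   callers below pass (upper - 1) when counting exclusive-bound loops. */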
#define _XMP_GPU_M_BARRIER_THREADS() __syncthreads()
#define _XMP_GPU_M_BARRIER_KERNEL() cudaThreadSynchronize()
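/* Note: cudaThreadSynchronize() is deprecated in current CUDA releases;
   cudaDeviceSynchronize() is the drop-in replacement. */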
#define _XMP_GPU_M_GET_ARRAY_GTOL(_gtol, _desc, _dim) \
_gtol = _XMP_GPU_M_GTOL(_desc, _dim)
#define _XMP_GPU_M_GET_ARRAY_ACC(_acc, _desc, _dim) \
_acc = _XMP_GPU_M_ACC(_desc, _dim)
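/* The launch-configuration code below references these runtime symbols. The
   exact declarations are not shown in this excerpt, so the following externs
   are a sketch based on the header's _XMP naming convention. */
extern int _XMP_gpu_max_thread;

extern int _XMP_gpu_max_block_dim_x;
extern int _XMP_gpu_max_block_dim_y;
extern int _XMP_gpu_max_block_dim_z;

extern "C" void _XMP_fatal(const char *msg);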
/* Linearize the 3-D thread and block coordinates into one global thread id
   (name and signature assumed from the runtime's _XMP_gpu_* convention). */
template<typename T>
__device__ void _XMP_gpu_calc_thread_id(T *index) {
  *index = threadIdx.x +
           (threadIdx.y * blockDim.x) +
           (threadIdx.z * blockDim.x * blockDim.y) +
           ((blockIdx.x +
             (blockIdx.y * gridDim.x) +
             (blockIdx.z * gridDim.x * gridDim.y)) * (blockDim.x * blockDim.y * blockDim.z));
}
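/* Worked example: with blockDim = (8, 4, 2) and gridDim = (16, 1, 1), the
   thread at threadIdx = (1, 2, 0) in blockIdx = (3, 0, 0) gets
   index = 1 + 2*8 + 0 + 3*(8*4*2) = 209. */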
/* Map a linear thread id to the corresponding 1-D loop iteration
   (name and signature assumed from the MAP_THREADS macros below). */
template<typename T>
__device__ void _XMP_gpu_calc_iter(unsigned long long tid,
                                   T lower0, T upper0, T stride0,
                                   T *iter0) {
  *iter0 = lower0 + (tid * stride0);
}
/* 2-D variant: tid indexes the flattened iteration space, innermost
   dimension first. count0 is reconstructed here, assuming an exclusive
   upper bound as in the MAP_THREADS macros. */
template<typename T>
__device__ void _XMP_gpu_calc_iter(unsigned long long tid,
                                   T lower0, T upper0, T stride0,
                                   T lower1, T upper1, T stride1,
                                   T *iter0, T *iter1) {
  T count0 = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0);

  *iter0 = lower0 + ((tid % count0) * stride0);
  *iter1 = lower1 + ((tid / count0) * stride1);
}
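/* Worked example: with count0 == 4, zero lower bounds, and unit strides,
   tid = 6 maps to *iter0 = 6 % 4 = 2 and *iter1 = 6 / 4 = 1. */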
/* 3-D variant: peel off dimension 0, then split the remainder between
   dimensions 1 and 2. */
template<typename T>
__device__ void _XMP_gpu_calc_iter(unsigned long long tid,
                                   T lower0, T upper0, T stride0,
                                   T lower1, T upper1, T stride1,
                                   T lower2, T upper2, T stride2,
                                   T *iter0, T *iter1, T *iter2) {
  T count0 = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0);
  T count1 = _XMP_M_COUNT_TRIPLETi(lower1, (upper1 - 1), stride1);
  T temp1 = tid / count0;

  *iter0 = lower0 + ((tid % count0) * stride0);
  *iter1 = lower1 + ((temp1 % count1) * stride1);
  *iter2 = lower2 + ((temp1 / count1) * stride2);
}
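/* Worked example: with count0 == 4, count1 == 3, zero lower bounds, and unit
   strides, tid = 23 gives temp1 = 23 / 4 = 5, hence *iter0 = 23 % 4 = 3,
   *iter1 = 5 % 3 = 2, *iter2 = 5 / 3 = 1. */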
#define _XMP_gpu_calc_iter_MAP_THREADS_1(_l0, _u0, _s0, _i0) \
{ \
  if ((blockIdx.x * blockDim.x + threadIdx.x) >= _XMP_M_COUNT_TRIPLETi(_l0, (_u0 - 1), _s0)) return; \
\
  _i0 = _l0 + ((blockIdx.x * blockDim.x + threadIdx.x) * _s0); \
}
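/* Usage sketch (hypothetical kernel, not part of this header): the macro
   assigns each thread one iteration of a 1-D loop over [0, n) and returns
   early from threads past the bounds.

   template<typename T>
   __global__ void axpy_kernel(T a, T *x, T *y, int n) {
     int i;
     _XMP_gpu_calc_iter_MAP_THREADS_1(0, n, 1, i);
     y[i] += a * x[i];
   }
*/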
#define _XMP_gpu_calc_iter_MAP_THREADS_2(_l0, _u0, _s0, _l1, _u1, _s1, _i0, _i1) \
{ \
  if ((blockIdx.x * blockDim.x + threadIdx.x) >= _XMP_M_COUNT_TRIPLETi(_l0, (_u0 - 1), _s0)) return; \
  if ((blockIdx.y * blockDim.y + threadIdx.y) >= _XMP_M_COUNT_TRIPLETi(_l1, (_u1 - 1), _s1)) return; \
\
  _i0 = _l0 + ((blockIdx.x * blockDim.x + threadIdx.x) * _s0); \
  _i1 = _l1 + ((blockIdx.y * blockDim.y + threadIdx.y) * _s1); \
}
#define _XMP_gpu_calc_iter_MAP_THREADS_3(_l0, _u0, _s0, _l1, _u1, _s1, _l2, _u2, _s2, _i0, _i1, _i2) \
{ \
  if ((blockIdx.x * blockDim.x + threadIdx.x) >= _XMP_M_COUNT_TRIPLETi(_l0, (_u0 - 1), _s0)) return; \
  if ((blockIdx.y * blockDim.y + threadIdx.y) >= _XMP_M_COUNT_TRIPLETi(_l1, (_u1 - 1), _s1)) return; \
  if ((blockIdx.z * blockDim.z + threadIdx.z) >= _XMP_M_COUNT_TRIPLETi(_l2, (_u2 - 1), _s2)) return; \
\
  _i0 = _l0 + ((blockIdx.x * blockDim.x + threadIdx.x) * _s0); \
  _i1 = _l1 + ((blockIdx.y * blockDim.y + threadIdx.y) * _s1); \
  _i2 = _l2 + ((blockIdx.z * blockDim.z + threadIdx.z) * _s2); \
}
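/* All three MAP_THREADS macros pass (_u - 1) to _XMP_M_COUNT_TRIPLETi, whose
   upper bound is inclusive, so the loop bounds _u0/_u1/_u2 are treated as
   exclusive. */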
/* Given the iteration total in total_iter_v and an (_x, _y, _z) thread-block
   shape, derive the grid dimensions and write them through block_x/y/z (all
   captured from the enclosing function). The early-return branch and the
   else arms are assumptions consistent with the visible checks. */
#define _XMP_GPU_M_CALC_CONFIG_PARAMS(_x, _y, _z) \
{ \
  unsigned long long num_threads = (_x) * (_y) * (_z); \
\
  *total_iter = total_iter_v; \
\
  if (num_threads > _XMP_gpu_max_thread) { \
    _XMP_fatal("too many threads are requested for GPU"); \
  } \
\
  if (num_threads >= total_iter_v) { \
    /* a single block covers every iteration */ \
    *block_x = 1; \
    *block_y = 1; \
    *block_z = 1; \
    return; \
  } \
\
  total_iter_v = _XMP_M_CEILi(total_iter_v, num_threads); \
\
  if (total_iter_v > _XMP_gpu_max_block_dim_x) { \
    *block_x = _XMP_gpu_max_block_dim_x; \
    total_iter_v = _XMP_M_CEILi(total_iter_v, _XMP_gpu_max_block_dim_x); \
    if (total_iter_v > _XMP_gpu_max_block_dim_y) { \
      *block_y = _XMP_gpu_max_block_dim_y; \
      total_iter_v = _XMP_M_CEILi(total_iter_v, _XMP_gpu_max_block_dim_y); \
      if (total_iter_v > _XMP_gpu_max_block_dim_z) { \
        _XMP_fatal("data is too big for GPU"); \
      } else { \
        *block_z = total_iter_v; \
      } \
    } else { \
      *block_y = total_iter_v; \
      *block_z = 1; \
    } \
  } else { \
    *block_x = total_iter_v; \
    *block_y = 1; \
    *block_z = 1; \
  } \
}
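/* Worked example: for total_iter_v = 1000000 and a 128-thread block, the
   macro requests _XMP_M_CEILi(1000000, 128) = 7813 blocks; assuming that
   fits within _XMP_gpu_max_block_dim_x, the grid becomes (7813, 1, 1). */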
/* Host-side helpers that compute a launch configuration from loop triplets.
   The function names and the default thread-block shape below are
   assumptions based on the header's conventions. */
template<typename T>
void _XMP_gpu_calc_config_params(unsigned long long *total_iter,
                                 int *block_x, int *block_y, int *block_z,
                                 int *thread_x, int *thread_y, int *thread_z,
                                 T lower0, T upper0, T stride0) {
  unsigned long long total_iter_v = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0);

  *thread_x = _XMP_gpu_max_thread; /* assumed default: one fully-populated block */
  *thread_y = 1;
  *thread_z = 1;

  _XMP_GPU_M_CALC_CONFIG_PARAMS(*thread_x, *thread_y, *thread_z);
}
template<typename T>
void _XMP_gpu_calc_config_params(unsigned long long *total_iter,
                                 int *block_x, int *block_y, int *block_z,
                                 int *thread_x, int *thread_y, int *thread_z,
                                 T lower0, T upper0, T stride0,
                                 T lower1, T upper1, T stride1) {
  unsigned long long total_iter_v = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0) *
                                    _XMP_M_COUNT_TRIPLETi(lower1, (upper1 - 1), stride1);

  *thread_x = _XMP_gpu_max_thread;
  *thread_y = 1;
  *thread_z = 1;

  _XMP_GPU_M_CALC_CONFIG_PARAMS(*thread_x, *thread_y, *thread_z);
}
template<typename T>
void _XMP_gpu_calc_config_params(unsigned long long *total_iter,
                                 int *block_x, int *block_y, int *block_z,
                                 int *thread_x, int *thread_y, int *thread_z,
                                 T lower0, T upper0, T stride0,
                                 T lower1, T upper1, T stride1,
                                 T lower2, T upper2, T stride2) {
  unsigned long long total_iter_v = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0) *
                                    _XMP_M_COUNT_TRIPLETi(lower1, (upper1 - 1), stride1) *
                                    _XMP_M_COUNT_TRIPLETi(lower2, (upper2 - 1), stride2);

  *thread_x = _XMP_gpu_max_thread;
  *thread_y = 1;
  *thread_z = 1;

  _XMP_GPU_M_CALC_CONFIG_PARAMS(*thread_x, *thread_y, *thread_z);
}
/* _NUM_THREADS variants take the thread-block shape from the caller. */
template<typename T>
void _XMP_gpu_calc_config_params_NUM_THREADS(unsigned long long *total_iter,
                                             int *block_x, int *block_y, int *block_z,
                                             int *thread_x, int *thread_y, int *thread_z,
                                             int thread_x_v,
                                             T lower0, T upper0, T stride0) {
  unsigned long long total_iter_v = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0);

  *thread_x = thread_x_v;
  *thread_y = 1;
  *thread_z = 1;

  _XMP_GPU_M_CALC_CONFIG_PARAMS(thread_x_v, 1, 1);
}
template<typename T>
void _XMP_gpu_calc_config_params_NUM_THREADS(unsigned long long *total_iter,
                                             int *block_x, int *block_y, int *block_z,
                                             int *thread_x, int *thread_y, int *thread_z,
                                             int thread_x_v, int thread_y_v,
                                             T lower0, T upper0, T stride0,
                                             T lower1, T upper1, T stride1) {
  unsigned long long total_iter_v = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0) *
                                    _XMP_M_COUNT_TRIPLETi(lower1, (upper1 - 1), stride1);

  *thread_x = thread_x_v;
  *thread_y = thread_y_v;
  *thread_z = 1;

  _XMP_GPU_M_CALC_CONFIG_PARAMS(thread_x_v, thread_y_v, 1);
}
template<typename T>
void _XMP_gpu_calc_config_params_NUM_THREADS(unsigned long long *total_iter,
                                             int *block_x, int *block_y, int *block_z,
                                             int *thread_x, int *thread_y, int *thread_z,
                                             int thread_x_v, int thread_y_v, int thread_z_v,
                                             T lower0, T upper0, T stride0,
                                             T lower1, T upper1, T stride1,
                                             T lower2, T upper2, T stride2) {
  unsigned long long total_iter_v = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0) *
                                    _XMP_M_COUNT_TRIPLETi(lower1, (upper1 - 1), stride1) *
                                    _XMP_M_COUNT_TRIPLETi(lower2, (upper2 - 1), stride2);

  *thread_x = thread_x_v;
  *thread_y = thread_y_v;
  *thread_z = thread_z_v;

  _XMP_GPU_M_CALC_CONFIG_PARAMS(thread_x_v, thread_y_v, thread_z_v);
}
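/* Usage sketch (hypothetical host code): derive a launch configuration for a
   1-D loop over [0, n) and launch with it. The kernel name and device
   buffers are illustrative only.

   unsigned long long total_iter;
   int bx, by, bz, tx, ty, tz;
   _XMP_gpu_calc_config_params(&total_iter, &bx, &by, &bz, &tx, &ty, &tz,
                               0, n, 1);
   dim3 grid(bx, by, bz), threads(tx, ty, tz);
   axpy_kernel<<<grid, threads>>>(2.0f, dev_x, dev_y, n);
   _XMP_GPU_M_BARRIER_KERNEL();
*/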
#endif // _XMP_GPU_RUNTIME_FUNC_DECL