libxmp/libxmpf in Omni Compiler 1.3.4
xmp_gpu_func.hpp
Go to the documentation of this file.
1 /*
2  * $TSUKUBA_Release: $
3  * $TSUKUBA_Copyright:
4  * $
5  */
6 
7 #ifndef _XMP_GPU_RUNTIME_FUNC_DECL
8 #define _XMP_GPU_RUNTIME_FUNC_DECL
9 
10 #include "xmp_constant.h"
11 #include "xmp_data_struct.h"
12 #include "xmp_index_macro.h"
13 
14 // - index functions -----------------------------------------------------------------------------------------------
15 #define _XMP_GPU_M_GTOL(_desc, _dim) \
16 (((_XMP_gpu_array_t *)_desc)[_dim].gtol)
17 #define _XMP_GPU_M_ACC(_desc, _dim) \
18 (((_XMP_gpu_array_t *)_desc)[_dim].acc)
19 
20 // --- integer functions
21 // calculate ceil(a/b)
// Integer helpers (truncating C division; intended for non-negative operands).

// ceil(n_ / d_): the exact quotient when d_ divides n_, otherwise one more.
#define _XMP_M_CEILi(n_, d_) ((((n_) % (d_)) == 0) ? ((n_) / (d_)) : (((n_) / (d_)) + 1))
// floor(n_ / d_): plain truncating division.
#define _XMP_M_FLOORi(n_, d_) ((n_) / (d_))
// Iteration count of the triplet l_:u_:s_ (both bounds inclusive).
#define _XMP_M_COUNT_TRIPLETi(l_, u_, s_) (_XMP_M_FLOORi(((u_) - (l_)), s_) + 1)
26 
27 // --- cuda barrier functions
// --- cuda barrier functions
// Block-level barrier: synchronizes all threads of the thread block.
#define _XMP_GPU_M_BARRIER_THREADS() __syncthreads()
// Host-side barrier: wait for all outstanding device work to finish.
// cudaThreadSynchronize() was deprecated and removed in CUDA 12;
// cudaDeviceSynchronize() is its exact drop-in replacement.
#define _XMP_GPU_M_BARRIER_KERNEL() cudaDeviceSynchronize()
30 
31 // --- get array info functions
// --- get array info functions
// Copy a per-dimension descriptor field into a caller-supplied lvalue.
// The target argument is parenthesized for macro hygiene.
#define _XMP_GPU_M_GET_ARRAY_GTOL(_gtol, _desc, _dim) \
(_gtol) = _XMP_GPU_M_GTOL(_desc, _dim)
#define _XMP_GPU_M_GET_ARRAY_ACC(_acc, _desc, _dim) \
(_acc) = _XMP_GPU_M_ACC(_desc, _dim)
36 
// Runtime error reporter, defined in xmp_util.c (C linkage).
extern "C" void _XMP_fatal(char *msg);

// Device limit: maximum number of threads per block.
extern int _XMP_gpu_max_thread;

// Device limits used to cap the per-axis block counts computed below.
// NOTE(review): despite the "block_dim" names these appear to bound the
// *grid* sizes (*block_x/y/z) — confirm against the runtime's setup code.
extern int _XMP_gpu_max_block_dim_x;
extern int _XMP_gpu_max_block_dim_y;
extern int _XMP_gpu_max_block_dim_z;
44 
// Store this thread's globally unique linearized id into *index.
// Threads are numbered contiguously inside a block (x fastest, then y,
// then z), and blocks are numbered contiguously across the grid in the
// same order, so ids run 0 .. (total threads in launch) - 1.
template<typename T>
__device__ void _XMP_gpu_calc_thread_id(T *index) {
  unsigned int block_size   = blockDim.x * blockDim.y * blockDim.z;
  unsigned int tid_in_block = threadIdx.x
                            + (threadIdx.y * blockDim.x)
                            + (threadIdx.z * blockDim.x * blockDim.y);
  unsigned int block_rank   = blockIdx.x
                            + (blockIdx.y * gridDim.x)
                            + (blockIdx.z * gridDim.x * gridDim.y);
  *index = tid_in_block + (block_rank * block_size);
}
54 
// Map a linearized thread id onto the 1-D iteration space
// lower0:upper0:stride0.  upper0 is unused here; it is accepted only so
// the signature parallels the 2-D/3-D overloads.
template<typename T>
__device__ void _XMP_gpu_calc_iter(unsigned long long tid,
                                   T lower0, T upper0, T stride0,
                                   T *iter0) {
  *iter0 = lower0 + (tid * stride0);
}
61 
// Map a linearized thread id onto a 2-D iteration space; dimension 0 is
// the fastest-varying one.  Upper bounds are exclusive (note the
// `upper0 - 1` fed to the inclusive-bound trip-count macro).
template<typename T>
__device__ void _XMP_gpu_calc_iter(unsigned long long tid,
                                   T lower0, T upper0, T stride0,
                                   T lower1, T upper1, T stride1,
                                   T *iter0,
                                   T *iter1) {
  // Trip count of dimension 0.
  T dim0_size = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0);

  *iter0 = lower0 + ((tid % dim0_size) * stride0);
  *iter1 = lower1 + ((tid / dim0_size) * stride1);
}
73 
// Map a linearized thread id onto a 3-D iteration space; dimension 0 is
// fastest-varying, then 1, then 2.  Upper bounds are exclusive.
template<typename T>
__device__ void _XMP_gpu_calc_iter(unsigned long long tid,
                                   T lower0, T upper0, T stride0,
                                   T lower1, T upper1, T stride1,
                                   T lower2, T upper2, T stride2,
                                   T *iter0,
                                   T *iter1,
                                   T *iter2) {
  // Trip counts of the two fastest dimensions.
  T dim0_size = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0);
  T dim1_size = _XMP_M_COUNT_TRIPLETi(lower1, (upper1 - 1), stride1);

  // Position within the plane spanned by dimensions 1 and 2.
  T plane = tid / dim0_size;

  *iter0 = lower0 + ((tid % dim0_size) * stride0);
  *iter1 = lower1 + ((plane % dim1_size) * stride1);
  *iter2 = lower2 + ((plane / dim1_size) * stride2);
}
90 
// Map the x thread coordinate directly onto a 1-D iteration space.
// Threads past the trip count leave the calling kernel via `return`,
// so this must expand inside a void kernel/device function body.
#define _XMP_gpu_calc_iter_MAP_THREADS_1(_l0, _u0, _s0, _i0) \
{ \
  if ((blockIdx.x * blockDim.x + threadIdx.x) >= _XMP_M_COUNT_TRIPLETi(_l0, (_u0 - 1), _s0)) return; \
\
  _i0 = _l0 + ((blockIdx.x * blockDim.x + threadIdx.x) * _s0); \
}
97 
// Map the (x, y) thread coordinates onto a 2-D iteration space.  A thread
// past either trip count leaves the calling kernel via `return` (the
// short-circuit || preserves the original guard-by-guard evaluation order).
#define _XMP_gpu_calc_iter_MAP_THREADS_2(_l0, _u0, _s0, _l1, _u1, _s1, _i0, _i1) \
{ \
  if (((blockIdx.x * blockDim.x + threadIdx.x) >= _XMP_M_COUNT_TRIPLETi(_l0, (_u0 - 1), _s0)) || \
      ((blockIdx.y * blockDim.y + threadIdx.y) >= _XMP_M_COUNT_TRIPLETi(_l1, (_u1 - 1), _s1))) return; \
\
  _i0 = _l0 + ((blockIdx.x * blockDim.x + threadIdx.x) * _s0); \
  _i1 = _l1 + ((blockIdx.y * blockDim.y + threadIdx.y) * _s1); \
}
106 
// Map the (x, y, z) thread coordinates onto a 3-D iteration space.  A
// thread past any trip count leaves the calling kernel via `return` (the
// short-circuit || preserves the original guard-by-guard evaluation order).
#define _XMP_gpu_calc_iter_MAP_THREADS_3(_l0, _u0, _s0, _l1, _u1, _s1, _l2, _u2, _s2, _i0, _i1, _i2) \
{ \
  if (((blockIdx.x * blockDim.x + threadIdx.x) >= _XMP_M_COUNT_TRIPLETi(_l0, (_u0 - 1), _s0)) || \
      ((blockIdx.y * blockDim.y + threadIdx.y) >= _XMP_M_COUNT_TRIPLETi(_l1, (_u1 - 1), _s1)) || \
      ((blockIdx.z * blockDim.z + threadIdx.z) >= _XMP_M_COUNT_TRIPLETi(_l2, (_u2 - 1), _s2))) return; \
\
  _i0 = _l0 + ((blockIdx.x * blockDim.x + threadIdx.x) * _s0); \
  _i1 = _l1 + ((blockIdx.y * blockDim.y + threadIdx.y) * _s1); \
  _i2 = _l2 + ((blockIdx.z * blockDim.z + threadIdx.z) * _s2); \
}
117 
// Derive a launch configuration for a thread block of _x * _y * _z threads.
// Expands in a function scope that provides `total_iter_v` (unsigned long
// long iteration count; consumed and overwritten) and the out-parameters
// total_iter, thread_x/y/z, block_x/y/z; may exit the enclosing void
// function via `return`.  (Comments use /* */ because // would swallow
// the trailing line-continuation backslashes.)
#define _XMP_GPU_M_CALC_CONFIG_PARAMS(_x, _y, _z) \
{ \
  unsigned long long threads_per_block = _x * _y * _z; \
\
  *total_iter = total_iter_v; \
\
  *thread_x = _x; \
  *thread_y = _y; \
  *thread_z = _z; \
\
  if (threads_per_block > _XMP_gpu_max_thread) { \
    _XMP_fatal("too many threads are requested for GPU"); \
  } \
\
  /* a single block already covers every iteration */ \
  if (threads_per_block >= total_iter_v) { \
    *block_x = 1; \
    *block_y = 1; \
    *block_z = 1; \
    return; \
  } \
\
  /* number of blocks needed overall */ \
  total_iter_v = _XMP_M_CEILi(total_iter_v, threads_per_block); \
\
  /* spread the block count over grid axes x, then y, then z */ \
  if (total_iter_v > _XMP_gpu_max_block_dim_x) { \
    *block_x = _XMP_gpu_max_block_dim_x; \
\
    total_iter_v = _XMP_M_CEILi(total_iter_v, _XMP_gpu_max_block_dim_x); \
    if (total_iter_v > _XMP_gpu_max_block_dim_y) { \
      *block_y = _XMP_gpu_max_block_dim_y; \
\
      total_iter_v = _XMP_M_CEILi(total_iter_v, _XMP_gpu_max_block_dim_y); \
      if (total_iter_v > _XMP_gpu_max_block_dim_z) { \
        _XMP_fatal("data is too big for GPU"); \
      } else { \
        *block_z = total_iter_v; \
      } \
    } else { \
      *block_y = total_iter_v; \
      *block_z = 1; \
    } \
  } else { \
    *block_x = total_iter_v; \
    *block_y = 1; \
    *block_z = 1; \
  } \
}
164 
// Compute the launch configuration for a 1-D XMP loop nest over
// lower0:upper0:stride0 (exclusive upper bound).
// Outputs: *total_iter (iteration count), *thread_x/y/z (block shape),
// *block_x/y/z (grid shape); calls _XMP_fatal on impossible sizes.
template<typename T>
void _XMP_gpu_calc_config_params(unsigned long long *total_iter,
                                 int *block_x, int *block_y, int *block_z,
                                 int *thread_x, int *thread_y, int *thread_z,
                                 T lower0, T upper0, T stride0) {
  unsigned long long total_iter_v = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0);

  // NOTE(review): the original invocation line was dropped from this
  // listing (source line 171).  Restored with a flat (max-threads, 1, 1)
  // block shape, consistent with the flattened ids produced by
  // _XMP_gpu_calc_thread_id — confirm the arguments against upstream.
  _XMP_GPU_M_CALC_CONFIG_PARAMS(_XMP_gpu_max_thread, 1, 1);
}
173 
// Compute the launch configuration for a 2-D XMP loop nest (exclusive
// upper bounds).  Outputs as in the 1-D overload.
template<typename T>
void _XMP_gpu_calc_config_params(unsigned long long *total_iter,
                                 int *block_x, int *block_y, int *block_z,
                                 int *thread_x, int *thread_y, int *thread_z,
                                 T lower0, T upper0, T stride0,
                                 T lower1, T upper1, T stride1) {
  unsigned long long total_iter_v = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0)
                                  * _XMP_M_COUNT_TRIPLETi(lower1, (upper1 - 1), stride1);

  // NOTE(review): the original invocation line was dropped from this
  // listing (source line 182).  Restored with a flat (max-threads, 1, 1)
  // block shape, consistent with the flattened ids produced by
  // _XMP_gpu_calc_thread_id — confirm the arguments against upstream.
  _XMP_GPU_M_CALC_CONFIG_PARAMS(_XMP_gpu_max_thread, 1, 1);
}
184 
// Compute the launch configuration for a 3-D XMP loop nest (exclusive
// upper bounds).  Outputs as in the 1-D overload.
template<typename T>
void _XMP_gpu_calc_config_params(unsigned long long *total_iter,
                                 int *block_x, int *block_y, int *block_z,
                                 int *thread_x, int *thread_y, int *thread_z,
                                 T lower0, T upper0, T stride0,
                                 T lower1, T upper1, T stride1,
                                 T lower2, T upper2, T stride2) {
  unsigned long long total_iter_v = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0)
                                  * _XMP_M_COUNT_TRIPLETi(lower1, (upper1 - 1), stride1)
                                  * _XMP_M_COUNT_TRIPLETi(lower2, (upper2 - 1), stride2);

  // NOTE(review): the original invocation line was dropped from this
  // listing (source line 195).  Restored with a flat (max-threads, 1, 1)
  // block shape, consistent with the flattened ids produced by
  // _XMP_gpu_calc_thread_id — confirm the arguments against upstream.
  _XMP_GPU_M_CALC_CONFIG_PARAMS(_XMP_gpu_max_thread, 1, 1);
}
197 
// Launch configuration for a MAP_THREADS 1-D loop: the caller fixes the
// thread-block width (thread_x_v) and this derives how many blocks along
// x cover the space lower0:upper0:stride0 (exclusive upper bound).
// Trip count and ceiling division are the exact inline expansions of
// _XMP_M_COUNT_TRIPLETi / _XMP_M_CEILi.
template<typename T>
void _XMP_gpu_calc_config_params_MAP_THREADS(int *block_x, int *block_y, int *block_z,
                                             int *thread_x, int *thread_y, int *thread_z,
                                             int thread_x_v,
                                             T lower0, T upper0, T stride0) {
  T iter_x = (((upper0 - 1) - lower0) / stride0) + 1;

  *thread_x = thread_x_v;
  *thread_y = 1;
  *thread_z = 1;

  *block_x = ((iter_x % thread_x_v) == 0) ? (iter_x / thread_x_v)
                                          : (iter_x / thread_x_v) + 1;
  *block_y = 1;
  *block_z = 1;
}
213 
// Launch configuration for a MAP_THREADS 2-D loop: caller-fixed block
// shape (thread_x_v, thread_y_v); grid sizes are the per-axis ceiling
// divisions of each trip count (exclusive upper bounds).  Trip count and
// ceiling division are inline expansions of _XMP_M_COUNT_TRIPLETi /
// _XMP_M_CEILi.
template<typename T>
void _XMP_gpu_calc_config_params_MAP_THREADS(int *block_x, int *block_y, int *block_z,
                                             int *thread_x, int *thread_y, int *thread_z,
                                             int thread_x_v, int thread_y_v,
                                             T lower0, T upper0, T stride0,
                                             T lower1, T upper1, T stride1) {
  T iter_x = (((upper0 - 1) - lower0) / stride0) + 1;
  T iter_y = (((upper1 - 1) - lower1) / stride1) + 1;

  *thread_x = thread_x_v;
  *thread_y = thread_y_v;
  *thread_z = 1;

  *block_x = ((iter_x % thread_x_v) == 0) ? (iter_x / thread_x_v)
                                          : (iter_x / thread_x_v) + 1;
  *block_y = ((iter_y % thread_y_v) == 0) ? (iter_y / thread_y_v)
                                          : (iter_y / thread_y_v) + 1;
  *block_z = 1;
}
231 
// Launch configuration for a MAP_THREADS 3-D loop: caller-fixed block
// shape (thread_x_v, thread_y_v, thread_z_v); grid sizes are the
// per-axis ceiling divisions of each trip count (exclusive upper
// bounds).  Trip count and ceiling division are inline expansions of
// _XMP_M_COUNT_TRIPLETi / _XMP_M_CEILi.
template<typename T>
void _XMP_gpu_calc_config_params_MAP_THREADS(int *block_x, int *block_y, int *block_z,
                                             int *thread_x, int *thread_y, int *thread_z,
                                             int thread_x_v, int thread_y_v, int thread_z_v,
                                             T lower0, T upper0, T stride0,
                                             T lower1, T upper1, T stride1,
                                             T lower2, T upper2, T stride2) {
  T iter_x = (((upper0 - 1) - lower0) / stride0) + 1;
  T iter_y = (((upper1 - 1) - lower1) / stride1) + 1;
  T iter_z = (((upper2 - 1) - lower2) / stride2) + 1;

  *thread_x = thread_x_v;
  *thread_y = thread_y_v;
  *thread_z = thread_z_v;

  *block_x = ((iter_x % thread_x_v) == 0) ? (iter_x / thread_x_v)
                                          : (iter_x / thread_x_v) + 1;
  *block_y = ((iter_y % thread_y_v) == 0) ? (iter_y / thread_y_v)
                                          : (iter_y / thread_y_v) + 1;
  *block_z = ((iter_z % thread_z_v) == 0) ? (iter_z / thread_z_v)
                                          : (iter_z / thread_z_v) + 1;
}
251 
252 #endif // _XMP_GPU_RUNTIME_FUNC_DECL
xmp_data_struct.h
_XMP_M_CEILi
#define _XMP_M_CEILi(a_, b_)
Definition: xmp_gpu_func.hpp:22
_XMP_gpu_max_block_dim_y
int _XMP_gpu_max_block_dim_y
_XMP_gpu_calc_config_params
void _XMP_gpu_calc_config_params(unsigned long long *total_iter, int *block_x, int *block_y, int *block_z, int *thread_x, int *thread_y, int *thread_z, T lower0, T upper0, T stride0)
Definition: xmp_gpu_func.hpp:166
_XMP_GPU_M_CALC_CONFIG_PARAMS
#define _XMP_GPU_M_CALC_CONFIG_PARAMS(_x, _y, _z)
Definition: xmp_gpu_func.hpp:118
_XMP_gpu_calc_iter
__device__ void _XMP_gpu_calc_iter(unsigned long long tid, T lower0, T upper0, T stride0, T *iter0)
Definition: xmp_gpu_func.hpp:56
_XMP_fatal
void _XMP_fatal(char *msg)
Definition: xmp_util.c:42
_XMP_gpu_max_block_dim_x
int _XMP_gpu_max_block_dim_x
_XMP_gpu_max_block_dim_z
int _XMP_gpu_max_block_dim_z
_XMP_gpu_calc_thread_id
__device__ void _XMP_gpu_calc_thread_id(T *index)
Definition: xmp_gpu_func.hpp:46
xmp_index_macro.h
_XMP_gpu_max_thread
int _XMP_gpu_max_thread
_XMP_M_COUNT_TRIPLETi
#define _XMP_M_COUNT_TRIPLETi(l_, u_, s_)
Definition: xmp_gpu_func.hpp:25
_XMP_gpu_calc_config_params_MAP_THREADS
void _XMP_gpu_calc_config_params_MAP_THREADS(int *block_x, int *block_y, int *block_z, int *thread_x, int *thread_y, int *thread_z, int thread_x_v, T lower0, T upper0, T stride0)
Definition: xmp_gpu_func.hpp:199
xmp_constant.h