libxmp/libxmpf in Omni Compiler  1.3.4
xmp_gpu_func.hpp File Reference
#include "xmp_constant.h"
#include "xmp_data_struct.h"
#include "xmp_index_macro.h"
Include dependency graph for xmp_gpu_func.hpp:

Go to the source code of this file.

Macros

#define _XMP_GPU_M_GTOL(_desc, _dim)   (((_XMP_gpu_array_t *)_desc)[_dim].gtol)
 
#define _XMP_GPU_M_ACC(_desc, _dim)   (((_XMP_gpu_array_t *)_desc)[_dim].acc)
 
#define _XMP_M_CEILi(a_, b_)   (((a_) % (b_)) == 0 ? ((a_) / (b_)) : ((a_) / (b_)) + 1)
 
#define _XMP_M_FLOORi(a_, b_)   ((a_) / (b_))
 
#define _XMP_M_COUNT_TRIPLETi(l_, u_, s_)   (_XMP_M_FLOORi(((u_) - (l_)), s_) + 1)
 
#define _XMP_GPU_M_BARRIER_THREADS()   __syncthreads()
 
#define _XMP_GPU_M_BARRIER_KERNEL()   cudaThreadSynchronize()
 
#define _XMP_GPU_M_GET_ARRAY_GTOL(_gtol, _desc, _dim)   _gtol = _XMP_GPU_M_GTOL(_desc, _dim)
 
#define _XMP_GPU_M_GET_ARRAY_ACC(_acc, _desc, _dim)   _acc = _XMP_GPU_M_ACC(_desc, _dim)
 
#define _XMP_gpu_calc_iter_MAP_THREADS_1(_l0, _u0, _s0, _i0)
 
#define _XMP_gpu_calc_iter_MAP_THREADS_2(_l0, _u0, _s0, _l1, _u1, _s1, _i0, _i1)
 
#define _XMP_gpu_calc_iter_MAP_THREADS_3(_l0, _u0, _s0, _l1, _u1, _s1, _l2, _u2, _s2, _i0, _i1, _i2)
 
#define _XMP_GPU_M_CALC_CONFIG_PARAMS(_x, _y, _z)
 

Functions

void _XMP_fatal (char *msg)
 
template<typename T >
__device__ void _XMP_gpu_calc_thread_id (T *index)
 
template<typename T >
__device__ void _XMP_gpu_calc_iter (unsigned long long tid, T lower0, T upper0, T stride0, T *iter0)
 
template<typename T >
__device__ void _XMP_gpu_calc_iter (unsigned long long tid, T lower0, T upper0, T stride0, T lower1, T upper1, T stride1, T *iter0, T *iter1)
 
template<typename T >
__device__ void _XMP_gpu_calc_iter (unsigned long long tid, T lower0, T upper0, T stride0, T lower1, T upper1, T stride1, T lower2, T upper2, T stride2, T *iter0, T *iter1, T *iter2)
 
template<typename T >
void _XMP_gpu_calc_config_params (unsigned long long *total_iter, int *block_x, int *block_y, int *block_z, int *thread_x, int *thread_y, int *thread_z, T lower0, T upper0, T stride0)
 
template<typename T >
void _XMP_gpu_calc_config_params (unsigned long long *total_iter, int *block_x, int *block_y, int *block_z, int *thread_x, int *thread_y, int *thread_z, T lower0, T upper0, T stride0, T lower1, T upper1, T stride1)
 
template<typename T >
void _XMP_gpu_calc_config_params (unsigned long long *total_iter, int *block_x, int *block_y, int *block_z, int *thread_x, int *thread_y, int *thread_z, T lower0, T upper0, T stride0, T lower1, T upper1, T stride1, T lower2, T upper2, T stride2)
 
template<typename T >
void _XMP_gpu_calc_config_params_MAP_THREADS (int *block_x, int *block_y, int *block_z, int *thread_x, int *thread_y, int *thread_z, int thread_x_v, T lower0, T upper0, T stride0)
 
template<typename T >
void _XMP_gpu_calc_config_params_MAP_THREADS (int *block_x, int *block_y, int *block_z, int *thread_x, int *thread_y, int *thread_z, int thread_x_v, int thread_y_v, T lower0, T upper0, T stride0, T lower1, T upper1, T stride1)
 
template<typename T >
void _XMP_gpu_calc_config_params_MAP_THREADS (int *block_x, int *block_y, int *block_z, int *thread_x, int *thread_y, int *thread_z, int thread_x_v, int thread_y_v, int thread_z_v, T lower0, T upper0, T stride0, T lower1, T upper1, T stride1, T lower2, T upper2, T stride2)
 

Variables

int _XMP_gpu_max_thread
 
int _XMP_gpu_max_block_dim_x
 
int _XMP_gpu_max_block_dim_y
 
int _XMP_gpu_max_block_dim_z
 

Macro Definition Documentation

◆ _XMP_gpu_calc_iter_MAP_THREADS_1

#define _XMP_gpu_calc_iter_MAP_THREADS_1 (   _l0,
  _u0,
  _s0,
  _i0 
)
Value:
{ \
if ((blockIdx.x * blockDim.x + threadIdx.x) >= _XMP_M_COUNT_TRIPLETi(_l0, (_u0 - 1), _s0)) return; \
\
_i0 = _l0 + ((blockIdx.x * blockDim.x + threadIdx.x) * _s0); \
}

◆ _XMP_gpu_calc_iter_MAP_THREADS_2

#define _XMP_gpu_calc_iter_MAP_THREADS_2 (   _l0,
  _u0,
  _s0,
  _l1,
  _u1,
  _s1,
  _i0,
  _i1 
)
Value:
{ \
if ((blockIdx.x * blockDim.x + threadIdx.x) >= _XMP_M_COUNT_TRIPLETi(_l0, (_u0 - 1), _s0)) return; \
if ((blockIdx.y * blockDim.y + threadIdx.y) >= _XMP_M_COUNT_TRIPLETi(_l1, (_u1 - 1), _s1)) return; \
\
_i0 = _l0 + ((blockIdx.x * blockDim.x + threadIdx.x) * _s0); \
_i1 = _l1 + ((blockIdx.y * blockDim.y + threadIdx.y) * _s1); \
}

◆ _XMP_gpu_calc_iter_MAP_THREADS_3

#define _XMP_gpu_calc_iter_MAP_THREADS_3 (   _l0,
  _u0,
  _s0,
  _l1,
  _u1,
  _s1,
  _l2,
  _u2,
  _s2,
  _i0,
  _i1,
  _i2 
)
Value:
{ \
if ((blockIdx.x * blockDim.x + threadIdx.x) >= _XMP_M_COUNT_TRIPLETi(_l0, (_u0 - 1), _s0)) return; \
if ((blockIdx.y * blockDim.y + threadIdx.y) >= _XMP_M_COUNT_TRIPLETi(_l1, (_u1 - 1), _s1)) return; \
if ((blockIdx.z * blockDim.z + threadIdx.z) >= _XMP_M_COUNT_TRIPLETi(_l2, (_u2 - 1), _s2)) return; \
\
_i0 = _l0 + ((blockIdx.x * blockDim.x + threadIdx.x) * _s0); \
_i1 = _l1 + ((blockIdx.y * blockDim.y + threadIdx.y) * _s1); \
_i2 = _l2 + ((blockIdx.z * blockDim.z + threadIdx.z) * _s2); \
}

◆ _XMP_GPU_M_ACC

#define _XMP_GPU_M_ACC (   _desc,
  _dim 
)    (((_XMP_gpu_array_t *)_desc)[_dim].acc)

◆ _XMP_GPU_M_BARRIER_KERNEL

#define _XMP_GPU_M_BARRIER_KERNEL ( )    cudaThreadSynchronize()

◆ _XMP_GPU_M_BARRIER_THREADS

#define _XMP_GPU_M_BARRIER_THREADS ( )    __syncthreads()

◆ _XMP_GPU_M_CALC_CONFIG_PARAMS

#define _XMP_GPU_M_CALC_CONFIG_PARAMS (   _x,
  _y,
  _z 
)

◆ _XMP_GPU_M_GET_ARRAY_ACC

#define _XMP_GPU_M_GET_ARRAY_ACC (   _acc,
  _desc,
  _dim 
)    _acc = _XMP_GPU_M_ACC(_desc, _dim)

◆ _XMP_GPU_M_GET_ARRAY_GTOL

#define _XMP_GPU_M_GET_ARRAY_GTOL (   _gtol,
  _desc,
  _dim 
)    _gtol = _XMP_GPU_M_GTOL(_desc, _dim)

◆ _XMP_GPU_M_GTOL

#define _XMP_GPU_M_GTOL (   _desc,
  _dim 
)    (((_XMP_gpu_array_t *)_desc)[_dim].gtol)

◆ _XMP_M_CEILi

#define _XMP_M_CEILi (   a_,
  b_ 
)    (((a_) % (b_)) == 0 ? ((a_) / (b_)) : ((a_) / (b_)) + 1)

◆ _XMP_M_COUNT_TRIPLETi

#define _XMP_M_COUNT_TRIPLETi (   l_,
  u_,
  s_ 
)    (_XMP_M_FLOORi(((u_) - (l_)), s_) + 1)

◆ _XMP_M_FLOORi

#define _XMP_M_FLOORi (   a_,
  b_ 
)    ((a_) / (b_))

Function Documentation

◆ _XMP_fatal()

void _XMP_fatal ( char *  msg)
43 {
44  fprintf(stderr, "[RANK:%d] XcalableMP runtime error: %s\n", _XMP_world_rank, msg);
45  MPI_Abort(MPI_COMM_WORLD, 1);
46 }

◆ _XMP_gpu_calc_config_params() [1/3]

template<typename T >
void _XMP_gpu_calc_config_params ( unsigned long long *  total_iter,
int *  block_x,
int *  block_y,
int *  block_z,
int *  thread_x,
int *  thread_y,
int *  thread_z,
T  lower0,
T  upper0,
T  stride0 
)
169  {
170  unsigned long long total_iter_v = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0);
172 }

◆ _XMP_gpu_calc_config_params() [2/3]

template<typename T >
void _XMP_gpu_calc_config_params ( unsigned long long *  total_iter,
int *  block_x,
int *  block_y,
int *  block_z,
int *  thread_x,
int *  thread_y,
int *  thread_z,
T  lower0,
T  upper0,
T  stride0,
T  lower1,
T  upper1,
T  stride1 
)
179  {
180  unsigned long long total_iter_v = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0)
181  * _XMP_M_COUNT_TRIPLETi(lower1, (upper1 - 1), stride1);
183 }

◆ _XMP_gpu_calc_config_params() [3/3]

template<typename T >
void _XMP_gpu_calc_config_params ( unsigned long long *  total_iter,
int *  block_x,
int *  block_y,
int *  block_z,
int *  thread_x,
int *  thread_y,
int *  thread_z,
T  lower0,
T  upper0,
T  stride0,
T  lower1,
T  upper1,
T  stride1,
T  lower2,
T  upper2,
T  stride2 
)
191  {
192  unsigned long long total_iter_v = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0)
193  * _XMP_M_COUNT_TRIPLETi(lower1, (upper1 - 1), stride1)
194  * _XMP_M_COUNT_TRIPLETi(lower2, (upper2 - 1), stride2);
196 }

◆ _XMP_gpu_calc_config_params_MAP_THREADS() [1/3]

template<typename T >
void _XMP_gpu_calc_config_params_MAP_THREADS ( int *  block_x,
int *  block_y,
int *  block_z,
int *  thread_x,
int *  thread_y,
int *  thread_z,
int  thread_x_v,
int  thread_y_v,
int  thread_z_v,
T  lower0,
T  upper0,
T  stride0,
T  lower1,
T  upper1,
T  stride1,
T  lower2,
T  upper2,
T  stride2 
)
238  {
239  T iter_x = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0);
240  T iter_y = _XMP_M_COUNT_TRIPLETi(lower1, (upper1 - 1), stride1);
241  T iter_z = _XMP_M_COUNT_TRIPLETi(lower2, (upper2 - 1), stride2);
242 
243  *thread_x = thread_x_v;
244  *thread_y = thread_y_v;
245  *thread_z = thread_z_v;
246 
247  *block_x = _XMP_M_CEILi(iter_x, thread_x_v);
248  *block_y = _XMP_M_CEILi(iter_y, thread_y_v);
249  *block_z = _XMP_M_CEILi(iter_z, thread_z_v);
250 }

◆ _XMP_gpu_calc_config_params_MAP_THREADS() [2/3]

template<typename T >
void _XMP_gpu_calc_config_params_MAP_THREADS ( int *  block_x,
int *  block_y,
int *  block_z,
int *  thread_x,
int *  thread_y,
int *  thread_z,
int  thread_x_v,
int  thread_y_v,
T  lower0,
T  upper0,
T  stride0,
T  lower1,
T  upper1,
T  stride1 
)
219  {
220  T iter_x = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0);
221  T iter_y = _XMP_M_COUNT_TRIPLETi(lower1, (upper1 - 1), stride1);
222 
223  *thread_x = thread_x_v;
224  *thread_y = thread_y_v;
225  *thread_z = 1;
226 
227  *block_x = _XMP_M_CEILi(iter_x, thread_x_v);
228  *block_y = _XMP_M_CEILi(iter_y, thread_y_v);
229  *block_z = 1;
230 }

◆ _XMP_gpu_calc_config_params_MAP_THREADS() [3/3]

template<typename T >
void _XMP_gpu_calc_config_params_MAP_THREADS ( int *  block_x,
int *  block_y,
int *  block_z,
int *  thread_x,
int *  thread_y,
int *  thread_z,
int  thread_x_v,
T  lower0,
T  upper0,
T  stride0 
)
202  {
203  T iter_x = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0);
204 
205  *thread_x = thread_x_v;
206  *thread_y = 1;
207  *thread_z = 1;
208 
209  *block_x = _XMP_M_CEILi(iter_x, thread_x_v);
210  *block_y = 1;
211  *block_z = 1;
212 }

◆ _XMP_gpu_calc_iter() [1/3]

template<typename T >
__device__ void _XMP_gpu_calc_iter ( unsigned long long  tid,
T  lower0,
T  upper0,
T  stride0,
T *  iter0 
)
58  {
59  *iter0 = lower0 + (tid * stride0);
60 }

◆ _XMP_gpu_calc_iter() [2/3]

template<typename T >
__device__ void _XMP_gpu_calc_iter ( unsigned long long  tid,
T  lower0,
T  upper0,
T  stride0,
T  lower1,
T  upper1,
T  stride1,
T *  iter0,
T *  iter1 
)
67  {
68  T count0 = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0);
69 
70  *iter0 = lower0 + ((tid % count0) * stride0);
71  *iter1 = lower1 + ((tid / count0) * stride1);
72 }

◆ _XMP_gpu_calc_iter() [3/3]

template<typename T >
__device__ void _XMP_gpu_calc_iter ( unsigned long long  tid,
T  lower0,
T  upper0,
T  stride0,
T  lower1,
T  upper1,
T  stride1,
T  lower2,
T  upper2,
T  stride2,
T *  iter0,
T *  iter1,
T *  iter2 
)
81  {
82  T count0 = _XMP_M_COUNT_TRIPLETi(lower0, (upper0 - 1), stride0);
83  T count1 = _XMP_M_COUNT_TRIPLETi(lower1, (upper1 - 1), stride1);
84 
85  T temp1 = tid / count0;
86  *iter0 = lower0 + ((tid % count0) * stride0);
87  *iter1 = lower1 + ((temp1 % count1) * stride1);
88  *iter2 = lower2 + ((temp1 / count1) * stride2);
89 }

◆ _XMP_gpu_calc_thread_id()

template<typename T >
__device__ void _XMP_gpu_calc_thread_id ( T *  index)
46  {
47  *index = threadIdx.x +
48  (threadIdx.y * blockDim.x) +
49  (threadIdx.z * blockDim.x * blockDim.y) +
50  ((blockIdx.x +
51  (blockIdx.y * gridDim.x) +
52  (blockIdx.z * gridDim.x * gridDim.y)) * (blockDim.x * blockDim.y * blockDim.z));
53 }

Variable Documentation

◆ _XMP_gpu_max_block_dim_x

int _XMP_gpu_max_block_dim_x

◆ _XMP_gpu_max_block_dim_y

int _XMP_gpu_max_block_dim_y

◆ _XMP_gpu_max_block_dim_z

int _XMP_gpu_max_block_dim_z

◆ _XMP_gpu_max_thread

int _XMP_gpu_max_thread
_XMP_M_CEILi
#define _XMP_M_CEILi(a_, b_)
Definition: xmp_gpu_func.hpp:22
_XMP_GPU_M_CALC_CONFIG_PARAMS
#define _XMP_GPU_M_CALC_CONFIG_PARAMS(_x, _y, _z)
Definition: xmp_gpu_func.hpp:118
_XMP_world_rank
int _XMP_world_rank
Definition: xmp_world.c:9
_XMP_M_COUNT_TRIPLETi
#define _XMP_M_COUNT_TRIPLETi(l_, u_, s_)
Definition: xmp_gpu_func.hpp:25