354 XACC_DEBUG(
"unpack_vector, dst=%p, dst_off=%zd, src=%p, src_off=%zd, blklen=%zd, stride=%zd, count=%zd, typesize=%zd, queue=%p, is_blocking=%d\n",
355 dst_mem, dst_offset, src_mem, src_offset, blocklength, stride, count, typesize, queue, is_blocking);
357 const int numThreads = 128;
363 size_t blocklength_e = blocklength / typesize;
364 size_t stride_e = stride / typesize;
365 size_t dst_offset_e = dst_offset / typesize;
366 size_t src_offset_e = src_offset / typesize;
368 #ifdef _XMP_XACC_PZCL
369 const int max_num_threads = 8192;
370 int num_threads = ADJUST_GLOBAL_WORK_SIZE(blocklength_e * count);
372 cl_uint work_dim = 1;
373 size_t global_work_size[] = {num_threads};
374 size_t local_work_size[] = {8};
376 if(global_work_size[0] > 8192 || global_work_size[0] % 128 != 0){
377 _XMP_fatal(
"invalid global_work_size at unpack vector");
383 if(blocklength_e >= numThreads){
386 while(tx < blocklength_e){
390 ty = numThreads / tx;
391 by = (count-1)/ty + 1;
393 cl_uint work_dim = 2;
394 size_t global_work_size[] = {bx*tx, by*ty};
395 size_t local_work_size[] = {tx, ty};
398 void *args[] = {&dst_mem, &dst_offset_e, &src_mem, &src_offset_e, &blocklength_e, &stride_e, &count};
399 size_t arg_sizes[] = {
sizeof(dst_mem),
sizeof(dst_offset_e),
sizeof(src_mem),
sizeof(src_offset_e),
sizeof(blocklength_e),
sizeof(stride_e),
sizeof(count)};
404 enqueue_kernel(queue, _kernels[_XACC_unpack_vector_8], 7, args, arg_sizes, work_dim, global_work_size, local_work_size);
408 enqueue_kernel(queue, _kernels[_XACC_unpack_vector_16], 7, args, arg_sizes, work_dim, global_work_size, local_work_size);
412 enqueue_kernel(queue, _kernels[_XACC_unpack_vector_32], 7, args, arg_sizes, work_dim, global_work_size, local_work_size);
416 enqueue_kernel(queue, _kernels[_XACC_unpack_vector_64], 7, args, arg_sizes, work_dim, global_work_size, local_work_size);
420 void *args_default[] = {&dst_mem, &dst_offset, &src_mem, &src_offset, &blocklength, &stride, &count};
421 size_t arg_sizes_default[] = {
sizeof(dst_mem),
sizeof(dst_offset),
sizeof(src_mem),
sizeof(src_offset),
sizeof(blocklength),
sizeof(stride),
sizeof(count)};
424 enqueue_kernel(queue, _kernels[_XACC_unpack_vector_8], 7, args_default, arg_sizes_default, work_dim, global_work_size, local_work_size);