Change arg type to unsigned int from signed int (#1078)

* Change arg type to unsigned int from signed int

* Fix formatting issues
This commit is contained in:
Grzegorz Wawiorko
2021-02-02 17:43:37 +01:00
committed by GitHub
parent ca673af488
commit 216455842d
3 changed files with 333 additions and 278 deletions

View File

@@ -1,6 +1,6 @@
// //
// Copyright (c) 2017 The Khronos Group Inc. // Copyright (c) 2017 The Khronos Group Inc.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // You may obtain a copy of the License at
@@ -16,27 +16,33 @@
#include "common.h" #include "common.h"
static char hash_table_kernel[] = static char hash_table_kernel[] =
"#if 0\n" "#if 0\n"
"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n" "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n" "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n"
"#endif\n" "#endif\n"
"typedef struct BinNode {\n" "typedef struct BinNode {\n"
" int value;\n" " int value;\n"
" atomic_uintptr_t pNext;\n" " atomic_uintptr_t pNext;\n"
"} BinNode;\n" "} BinNode;\n"
"__kernel void build_hash_table(__global uint* input, __global BinNode* pNodes, volatile __global atomic_uint* pNumNodes, uint numBins)\n" "__kernel void build_hash_table(__global uint* input, __global BinNode* "
"{\n" "pNodes, volatile __global atomic_uint* pNumNodes, uint numBins)\n"
" __global BinNode *pNew = &pNodes[ atomic_fetch_add_explicit(pNumNodes, 1, memory_order_relaxed, memory_scope_all_svm_devices) ];\n" "{\n"
" uint i = get_global_id(0);\n" " __global BinNode *pNew = &pNodes[ atomic_fetch_add_explicit(pNumNodes, "
" uint b = input[i] % numBins;\n" "1u, memory_order_relaxed, memory_scope_all_svm_devices) ];\n"
" pNew->value = input[i];\n" " uint i = get_global_id(0);\n"
" uintptr_t next = atomic_load_explicit(&(pNodes[b].pNext), memory_order_seq_cst, memory_scope_all_svm_devices);\n" " uint b = input[i] % numBins;\n"
" do\n" " pNew->value = input[i];\n"
" {\n" " uintptr_t next = atomic_load_explicit(&(pNodes[b].pNext), "
" atomic_store_explicit(&(pNew->pNext), next, memory_order_seq_cst, memory_scope_all_svm_devices);\n" // always inserting at head of list "memory_order_seq_cst, memory_scope_all_svm_devices);\n"
" } while(!atomic_compare_exchange_strong_explicit(&(pNodes[b].pNext), &next, (uintptr_t)pNew, memory_order_seq_cst, memory_order_relaxed, memory_scope_all_svm_devices));\n" " do\n"
"}\n"; " {\n"
" atomic_store_explicit(&(pNew->pNext), next, memory_order_seq_cst, "
"memory_scope_all_svm_devices);\n" // always inserting at head of list
" } while(!atomic_compare_exchange_strong_explicit(&(pNodes[b].pNext), "
"&next, (uintptr_t)pNew, memory_order_seq_cst, memory_order_relaxed, "
"memory_scope_all_svm_devices));\n"
"}\n";
typedef struct BinNode{ typedef struct BinNode{
cl_uint value; cl_uint value;

View File

@@ -1,6 +1,6 @@
// //
// Copyright (c) 2017 The Khronos Group Inc. // Copyright (c) 2017 The Khronos Group Inc.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // You may obtain a copy of the License at
@@ -17,15 +17,19 @@
const char *find_targets_kernel[] = { const char *find_targets_kernel[] = {
"__kernel void find_targets(__global uint* image, uint target, volatile __global atomic_uint *numTargetsFound, volatile __global atomic_uint *targetLocations)\n" "__kernel void find_targets(__global uint* image, uint target, volatile "
"{\n" "__global atomic_uint *numTargetsFound, volatile __global atomic_uint "
" size_t i = get_global_id(0);\n" "*targetLocations)\n"
" uint index;\n" "{\n"
" if(image[i] == target) {\n" " size_t i = get_global_id(0);\n"
" index = atomic_fetch_add_explicit(numTargetsFound, 1, memory_order_relaxed, memory_scope_device); \n" " uint index;\n"
" atomic_exchange_explicit(&targetLocations[index], i, memory_order_relaxed, memory_scope_all_svm_devices); \n" " if(image[i] == target) {\n"
" }\n" " index = atomic_fetch_add_explicit(numTargetsFound, 1u, "
"}\n" "memory_order_relaxed, memory_scope_device); \n"
" atomic_exchange_explicit(&targetLocations[index], i, "
"memory_order_relaxed, memory_scope_all_svm_devices); \n"
" }\n"
"}\n"
}; };

View File

@@ -1,6 +1,6 @@
// //
// Copyright (c) 2017 The Khronos Group Inc. // Copyright (c) 2017 The Khronos Group Inc.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // You may obtain a copy of the License at
@@ -27,271 +27,316 @@
#ifdef CL_VERSION_2_0 #ifdef CL_VERSION_2_0
extern int gWimpyMode; extern int gWimpyMode;
static const char* helper_ndrange_1d_glo[] = static const char *helper_ndrange_1d_glo[] = {
{ NL,
NL, "void block_fn(int len, __global atomic_uint* val)" "void block_fn(int len, __global atomic_uint* val)" NL,
NL, "{" "{" NL,
NL, " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);" " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
NL, "}" "memory_order_relaxed, memory_scope_device);" NL,
NL, "" "}" NL,
NL, "kernel void helper_ndrange_1d_glo(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global atomic_uint* val, __global uint* ofs_arr)" "" NL,
NL, "{" "kernel void helper_ndrange_1d_glo(__global int* res, uint n, uint len, "
NL, " size_t tid = get_global_id(0);" "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" "atomic_uint* val, __global uint* ofs_arr)" NL,
NL, "" "{" NL,
NL, " for(int i = 0; i < n; i++)" " size_t tid = get_global_id(0);" NL,
NL, " {" " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
NL, " ndrange_t ndrange = ndrange_1D(glob_size_arr[i]);" "" NL,
NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" " for(int i = 0; i < n; i++)" NL,
NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" " {" NL,
NL, " }" " ndrange_t ndrange = ndrange_1D(glob_size_arr[i]);" NL,
NL, "}" " int enq_res = enqueue_kernel(get_default_queue(), "
NL "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
" if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
" }" NL,
"}" NL
}; };
static const char* helper_ndrange_1d_loc[] = static const char *helper_ndrange_1d_loc[] = {
{ NL,
NL, "void block_fn(int len, __global atomic_uint* val)" "void block_fn(int len, __global atomic_uint* val)" NL,
NL, "{" "{" NL,
NL, " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);" " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
NL, "}" "memory_order_relaxed, memory_scope_device);" NL,
NL, "" "}" NL,
NL, "kernel void helper_ndrange_1d_loc(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global atomic_uint* val, __global uint* ofs_arr)" "" NL,
NL, "{" "kernel void helper_ndrange_1d_loc(__global int* res, uint n, uint len, "
NL, " size_t tid = get_global_id(0);" "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" "atomic_uint* val, __global uint* ofs_arr)" NL,
NL, "" "{" NL,
NL, " for(int k = 0; k < n; k++)" " size_t tid = get_global_id(0);" NL,
NL, " {" " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
NL, " for(int i = 0; i < n; i++)" "" NL,
NL, " {" " for(int k = 0; k < n; k++)" NL,
NL, " if (glob_size_arr[i] >= loc_size_arr[k])" " {" NL,
NL, " {" " for(int i = 0; i < n; i++)" NL,
NL, " ndrange_t ndrange = ndrange_1D(glob_size_arr[i], loc_size_arr[k]);" " {" NL,
NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" " if (glob_size_arr[i] >= loc_size_arr[k])" NL,
NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" " {" NL,
NL, " }" " ndrange_t ndrange = ndrange_1D(glob_size_arr[i], "
NL, " }" "loc_size_arr[k]);" NL,
NL, " }" " int enq_res = enqueue_kernel(get_default_queue(), "
NL, "}" "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
NL " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
" }" NL,
" }" NL,
" }" NL,
"}" NL
}; };
static const char* helper_ndrange_1d_ofs[] = static const char *helper_ndrange_1d_ofs[] = {
{ NL,
NL, "void block_fn(int len, __global atomic_uint* val)" "void block_fn(int len, __global atomic_uint* val)" NL,
NL, "{" "{" NL,
NL, " atomic_fetch_add_explicit(&val[(get_global_offset(0) + get_global_linear_id()) % len], 1, memory_order_relaxed, memory_scope_device);" " atomic_fetch_add_explicit(&val[(get_global_offset(0) + "
NL, "}" "get_global_linear_id()) % len], 1u, memory_order_relaxed, "
NL, "" "memory_scope_device);" NL,
NL, "kernel void helper_ndrange_1d_ofs(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global atomic_uint* val, __global uint* ofs_arr)" "}" NL,
NL, "{" "" NL,
NL, " size_t tid = get_global_id(0);" "kernel void helper_ndrange_1d_ofs(__global int* res, uint n, uint len, "
NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" "__global uint* glob_size_arr, __global uint* loc_size_arr, __global "
NL, "" "atomic_uint* val, __global uint* ofs_arr)" NL,
NL, " for(int l = 0; l < n; l++)" "{" NL,
NL, " {" " size_t tid = get_global_id(0);" NL,
NL, " for(int k = 0; k < n; k++)" " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
NL, " {" "" NL,
NL, " for(int i = 0; i < n; i++)" " for(int l = 0; l < n; l++)" NL,
NL, " {" " {" NL,
NL, " if (glob_size_arr[i] >= loc_size_arr[k])" " for(int k = 0; k < n; k++)" NL,
NL, " {" " {" NL,
NL, " ndrange_t ndrange = ndrange_1D(ofs_arr[l], glob_size_arr[i], loc_size_arr[k]);" " for(int i = 0; i < n; i++)" NL,
NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" " {" NL,
NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" " if (glob_size_arr[i] >= loc_size_arr[k])" NL,
NL, " }" " {" NL,
NL, " }" " ndrange_t ndrange = ndrange_1D(ofs_arr[l], glob_size_arr[i], "
NL, " }" "loc_size_arr[k]);" NL,
NL, " }" " int enq_res = enqueue_kernel(get_default_queue(), "
NL, "}" "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
NL " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
" }" NL,
" }" NL,
" }" NL,
" }" NL,
"}" NL
}; };
static const char* helper_ndrange_2d_glo[] = static const char *helper_ndrange_2d_glo[] = {
{ NL,
NL, "void block_fn(int len, __global atomic_uint* val)" "void block_fn(int len, __global atomic_uint* val)" NL,
NL, "{" "{" NL,
NL, " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);" " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
NL, "}" "memory_order_relaxed, memory_scope_device);" NL,
NL, "" "}" NL,
NL, "kernel void helper_ndrange_2d_glo(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val, __global uint* ofs_arr)" "" NL,
NL, "{" "kernel void helper_ndrange_2d_glo(__global int* res, uint n, uint len, "
NL, " size_t tid = get_global_id(0);" "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" "val, __global uint* ofs_arr)" NL,
NL, "" "{" NL,
NL, " for(int i = 0; i < n; i++)" " size_t tid = get_global_id(0);" NL,
NL, " {" " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
NL, " size_t glob_size[2] = { glob_size_arr[i], glob_size_arr[(i + 1) % n] };" "" NL,
NL, " ndrange_t ndrange = ndrange_2D(glob_size);" " for(int i = 0; i < n; i++)" NL,
NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" " {" NL,
NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" " size_t glob_size[2] = { glob_size_arr[i], glob_size_arr[(i + 1) % n] "
NL, " }" "};" NL,
NL, "}" " ndrange_t ndrange = ndrange_2D(glob_size);" NL,
NL " int enq_res = enqueue_kernel(get_default_queue(), "
"CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
" if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
" }" NL,
"}" NL
}; };
static const char* helper_ndrange_2d_loc[] = static const char *helper_ndrange_2d_loc[] = {
{ NL,
NL, "void block_fn(int len, __global atomic_uint* val)" "void block_fn(int len, __global atomic_uint* val)" NL,
NL, "{" "{" NL,
NL, " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);" " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
NL, "}" "memory_order_relaxed, memory_scope_device);" NL,
NL, "" "}" NL,
NL, "kernel void helper_ndrange_2d_loc(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val, __global uint* ofs_arr)" "" NL,
NL, "{" "kernel void helper_ndrange_2d_loc(__global int* res, uint n, uint len, "
NL, " size_t tid = get_global_id(0);" "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" "val, __global uint* ofs_arr)" NL,
NL, "" "{" NL,
NL, " for(int k = 0; k < n; k++)" " size_t tid = get_global_id(0);" NL,
NL, " {" " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
NL, " for(int i = 0; i < n; i++)" "" NL,
NL, " {" " for(int k = 0; k < n; k++)" NL,
NL, " if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])" " {" NL,
NL, " {" " for(int i = 0; i < n; i++)" NL,
NL, " size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % n] };" " {" NL,
NL, " size_t loc_size[] = { 1, loc_size_arr[k] };" " if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])" NL,
NL, "" " {" NL,
NL, " ndrange_t ndrange = ndrange_2D(glob_size, loc_size);" " size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % "
NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" "n] };" NL,
NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" " size_t loc_size[] = { 1, loc_size_arr[k] };" NL,
NL, " }" "" NL,
NL, " }" " ndrange_t ndrange = ndrange_2D(glob_size, loc_size);" NL,
NL, " }" " int enq_res = enqueue_kernel(get_default_queue(), "
NL, "}" "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
NL " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
" }" NL,
" }" NL,
" }" NL,
"}" NL
}; };
static const char* helper_ndrange_2d_ofs[] = static const char *helper_ndrange_2d_ofs[] = {
{ NL,
NL, "void block_fn(int len, __global atomic_uint* val)" "void block_fn(int len, __global atomic_uint* val)" NL,
NL, "{" "{" NL,
NL, " atomic_fetch_add_explicit(&val[(get_global_offset(1) * get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % len], 1, memory_order_relaxed, memory_scope_device);" " atomic_fetch_add_explicit(&val[(get_global_offset(1) * "
NL, "}" "get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % "
NL, "" "len], 1u, memory_order_relaxed, memory_scope_device);" NL,
NL, "kernel void helper_ndrange_2d_ofs(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val, __global uint* ofs_arr)" "}" NL,
NL, "{" "" NL,
NL, " size_t tid = get_global_id(0);" "kernel void helper_ndrange_2d_ofs(__global int* res, uint n, uint len, "
NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
NL, "" "val, __global uint* ofs_arr)" NL,
NL, " for(int l = 0; l < n; l++)" "{" NL,
NL, " {" " size_t tid = get_global_id(0);" NL,
NL, " for(int k = 0; k < n; k++)" " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
NL, " {" "" NL,
NL, " for(int i = 0; i < n; i++)" " for(int l = 0; l < n; l++)" NL,
NL, " {" " {" NL,
NL, " if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])" " for(int k = 0; k < n; k++)" NL,
NL, " {" " {" NL,
NL, " size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % n]};" " for(int i = 0; i < n; i++)" NL,
NL, " size_t loc_size[] = { 1, loc_size_arr[k] };" " {" NL,
NL, " size_t ofs[] = { ofs_arr[l], ofs_arr[(l + 1) % n] };" " if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])" NL,
NL, "" " {" NL,
NL, " ndrange_t ndrange = ndrange_2D(ofs,glob_size,loc_size);" " size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) "
NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" "% n]};" NL,
NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" " size_t loc_size[] = { 1, loc_size_arr[k] };" NL,
NL, " }" " size_t ofs[] = { ofs_arr[l], ofs_arr[(l + 1) % n] };" NL,
NL, " }" "" NL,
NL, " }" " ndrange_t ndrange = ndrange_2D(ofs,glob_size,loc_size);" NL,
NL, " }" " int enq_res = enqueue_kernel(get_default_queue(), "
NL, "}" "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
NL " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
" }" NL,
" }" NL,
" }" NL,
" }" NL,
"}" NL
}; };
static const char* helper_ndrange_3d_glo[] = static const char *helper_ndrange_3d_glo[] = {
{ NL,
NL, "void block_fn(int len, __global atomic_uint* val)" "void block_fn(int len, __global atomic_uint* val)" NL,
NL, "{" "{" NL,
NL, " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);" " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
NL, "}" "memory_order_relaxed, memory_scope_device);" NL,
NL, "" "}" NL,
NL, "kernel void helper_ndrange_3d_glo(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val, __global uint* ofs_arr)" "" NL,
NL, "{" "kernel void helper_ndrange_3d_glo(__global int* res, uint n, uint len, "
NL, " size_t tid = get_global_id(0);" "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" "val, __global uint* ofs_arr)" NL,
NL, "" "{" NL,
NL, " for(int i = 0; i < n; i++)" " size_t tid = get_global_id(0);" NL,
NL, " {" " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
NL, " uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) % n] * glob_size_arr[(i + 2) % n];" "" NL,
NL, " if (global_work_size <= (len * len))" " for(int i = 0; i < n; i++)" NL,
NL, " {" " {" NL,
NL, " size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) % n], glob_size_arr[(i + 2) % n] };" " uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) % "
NL, " ndrange_t ndrange = ndrange_3D(glob_size);" "n] * glob_size_arr[(i + 2) % n];" NL,
NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" " if (global_work_size <= (len * len))" NL,
NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" " {" NL,
NL, " }" " size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) % "
NL, " }" "n], glob_size_arr[(i + 2) % n] };" NL,
NL, "}" " ndrange_t ndrange = ndrange_3D(glob_size);" NL,
NL " int enq_res = enqueue_kernel(get_default_queue(), "
"CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
" if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
" }" NL,
" }" NL,
"}" NL
}; };
static const char* helper_ndrange_3d_loc[] = static const char *helper_ndrange_3d_loc[] = {
{ NL,
NL, "void block_fn(int len, __global atomic_uint* val)" "void block_fn(int len, __global atomic_uint* val)" NL,
NL, "{" "{" NL,
NL, " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);" " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, "
NL, "}" "memory_order_relaxed, memory_scope_device);" NL,
NL, "" "}" NL,
NL, "kernel void helper_ndrange_3d_loc(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val, __global uint* ofs_arr)" "" NL,
NL, "{" "kernel void helper_ndrange_3d_loc(__global int* res, uint n, uint len, "
NL, " size_t tid = get_global_id(0);" "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" "val, __global uint* ofs_arr)" NL,
NL, "" "{" NL,
NL, " for(int k = 0; k < n; k++)" " size_t tid = get_global_id(0);" NL,
NL, " {" " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
NL, " for(int i = 0; i < n; i++)" "" NL,
NL, " {" " for(int k = 0; k < n; k++)" NL,
NL, " uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) % n] * glob_size_arr[(i + 2) % n];" " {" NL,
NL, " if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && global_work_size <= (len * len))" " for(int i = 0; i < n; i++)" NL,
NL, " {" " {" NL,
NL, " size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % n], glob_size_arr[(i + 2) % n] };" " uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) % "
NL, " size_t loc_size[] = { 1, 1, loc_size_arr[k] };" "n] * glob_size_arr[(i + 2) % n];" NL,
NL, " ndrange_t ndrange = ndrange_3D(glob_size,loc_size);" " if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && "
NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" "global_work_size <= (len * len))" NL,
NL, " " " {" NL,
NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" " size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % "
NL, " }" "n], glob_size_arr[(i + 2) % n] };" NL,
NL, " }" " size_t loc_size[] = { 1, 1, loc_size_arr[k] };" NL,
NL, " }" " ndrange_t ndrange = ndrange_3D(glob_size,loc_size);" NL,
NL, "}" " int enq_res = enqueue_kernel(get_default_queue(), "
NL "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
" " NL,
" if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
" }" NL,
" }" NL,
" }" NL,
"}" NL
}; };
static const char* helper_ndrange_3d_ofs[] = static const char *helper_ndrange_3d_ofs[] = {
{ NL,
NL, "void block_fn(int len, __global atomic_uint* val)" "void block_fn(int len, __global atomic_uint* val)" NL,
NL, "{" "{" NL,
NL, " atomic_fetch_add_explicit(&val[(get_global_offset(2) * get_global_size(0) * get_global_size(1) + get_global_offset(1) * get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % len], 1, memory_order_relaxed, memory_scope_device);" " atomic_fetch_add_explicit(&val[(get_global_offset(2) * "
NL, "}" "get_global_size(0) * get_global_size(1) + get_global_offset(1) * "
NL, "" "get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % "
NL, "kernel void helper_ndrange_3d_ofs(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val, __global uint* ofs_arr)" "len], 1u, memory_order_relaxed, memory_scope_device);" NL,
NL, "{" "}" NL,
NL, " size_t tid = get_global_id(0);" "" NL,
NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" "kernel void helper_ndrange_3d_ofs(__global int* res, uint n, uint len, "
NL, "" "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* "
NL, " for(int l = 0; l < n; l++)" "val, __global uint* ofs_arr)" NL,
NL, " {" "{" NL,
NL, " for(int k = 0; k < n; k++)" " size_t tid = get_global_id(0);" NL,
NL, " {" " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL,
NL, " for(int i = 0; i < n; i++)" "" NL,
NL, " {" " for(int l = 0; l < n; l++)" NL,
NL, " uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) % n] * glob_size_arr[(i + 2) % n];" " {" NL,
NL, " if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && global_work_size <= (len * len))" " for(int k = 0; k < n; k++)" NL,
NL, " {" " {" NL,
NL, " size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) % n], glob_size_arr[(i + 2) % n]};" " for(int i = 0; i < n; i++)" NL,
NL, " size_t loc_size[3] = { 1, 1, loc_size_arr[k] };" " {" NL,
NL, " size_t ofs[3] = { ofs_arr[l], ofs_arr[(l + 1) % n], ofs_arr[(l + 2) % n] };" " uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) "
NL, " ndrange_t ndrange = ndrange_3D(ofs,glob_size,loc_size);" "% n] * glob_size_arr[(i + 2) % n];" NL,
NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" " if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && "
NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" "global_work_size <= (len * len))" NL,
NL, " }" " {" NL,
NL, " }" " size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) "
NL, " }" "% n], glob_size_arr[(i + 2) % n]};" NL,
NL, " }" " size_t loc_size[3] = { 1, 1, loc_size_arr[k] };" NL,
NL, "}" " size_t ofs[3] = { ofs_arr[l], ofs_arr[(l + 1) % n], ofs_arr[(l "
NL "+ 2) % n] };" NL,
" ndrange_t ndrange = ndrange_3D(ofs,glob_size,loc_size);" NL,
" int enq_res = enqueue_kernel(get_default_queue(), "
"CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL,
" if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL,
" }" NL,
" }" NL,
" }" NL,
" }" NL,
"}" NL
}; };
static const kernel_src_dim_check sources_ndrange_Xd[] = static const kernel_src_dim_check sources_ndrange_Xd[] =