diff --git a/test_conformance/SVM/test_fine_grain_memory_consistency.cpp b/test_conformance/SVM/test_fine_grain_memory_consistency.cpp index 42ea0bd2..b28db411 100644 --- a/test_conformance/SVM/test_fine_grain_memory_consistency.cpp +++ b/test_conformance/SVM/test_fine_grain_memory_consistency.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -16,27 +16,33 @@ #include "common.h" static char hash_table_kernel[] = - "#if 0\n" - "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n" - "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n" - "#endif\n" - "typedef struct BinNode {\n" - " int value;\n" - " atomic_uintptr_t pNext;\n" - "} BinNode;\n" + "#if 0\n" + "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n" + "#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable\n" + "#endif\n" + "typedef struct BinNode {\n" + " int value;\n" + " atomic_uintptr_t pNext;\n" + "} BinNode;\n" - "__kernel void build_hash_table(__global uint* input, __global BinNode* pNodes, volatile __global atomic_uint* pNumNodes, uint numBins)\n" - "{\n" - " __global BinNode *pNew = &pNodes[ atomic_fetch_add_explicit(pNumNodes, 1, memory_order_relaxed, memory_scope_all_svm_devices) ];\n" - " uint i = get_global_id(0);\n" - " uint b = input[i] % numBins;\n" - " pNew->value = input[i];\n" - " uintptr_t next = atomic_load_explicit(&(pNodes[b].pNext), memory_order_seq_cst, memory_scope_all_svm_devices);\n" - " do\n" - " {\n" - " atomic_store_explicit(&(pNew->pNext), next, memory_order_seq_cst, memory_scope_all_svm_devices);\n" // always inserting at head of list - " } while(!atomic_compare_exchange_strong_explicit(&(pNodes[b].pNext), &next, (uintptr_t)pNew, memory_order_seq_cst, memory_order_relaxed, memory_scope_all_svm_devices));\n" - "}\n"; + "__kernel void build_hash_table(__global uint* input, __global BinNode* " + "pNodes, volatile __global atomic_uint* pNumNodes, uint numBins)\n" + "{\n" + " __global BinNode *pNew = &pNodes[ atomic_fetch_add_explicit(pNumNodes, " + "1u, memory_order_relaxed, memory_scope_all_svm_devices) ];\n" + " uint i = get_global_id(0);\n" + " uint b = input[i] % numBins;\n" + " pNew->value = input[i];\n" + " uintptr_t next = atomic_load_explicit(&(pNodes[b].pNext), " + "memory_order_seq_cst, memory_scope_all_svm_devices);\n" + " do\n" + " {\n" + " atomic_store_explicit(&(pNew->pNext), next, memory_order_seq_cst, " + "memory_scope_all_svm_devices);\n" // always inserting at head of list + " } while(!atomic_compare_exchange_strong_explicit(&(pNodes[b].pNext), " + "&next, (uintptr_t)pNew, memory_order_seq_cst, memory_order_relaxed, " + "memory_scope_all_svm_devices));\n" + "}\n"; typedef struct BinNode{ cl_uint value; diff --git a/test_conformance/SVM/test_fine_grain_sync_buffers.cpp b/test_conformance/SVM/test_fine_grain_sync_buffers.cpp index 4cc34952..0b94cbf2 100644 --- a/test_conformance/SVM/test_fine_grain_sync_buffers.cpp +++ b/test_conformance/SVM/test_fine_grain_sync_buffers.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -17,15 +17,19 @@ const char *find_targets_kernel[] = { - "__kernel void find_targets(__global uint* image, uint target, volatile __global atomic_uint *numTargetsFound, volatile __global atomic_uint *targetLocations)\n" - "{\n" - " size_t i = get_global_id(0);\n" - " uint index;\n" - " if(image[i] == target) {\n" - " index = atomic_fetch_add_explicit(numTargetsFound, 1, memory_order_relaxed, memory_scope_device); \n" - " atomic_exchange_explicit(&targetLocations[index], i, memory_order_relaxed, memory_scope_all_svm_devices); \n" - " }\n" - "}\n" + "__kernel void find_targets(__global uint* image, uint target, volatile " + "__global atomic_uint *numTargetsFound, volatile __global atomic_uint " + "*targetLocations)\n" + "{\n" + " size_t i = get_global_id(0);\n" + " uint index;\n" + " if(image[i] == target) {\n" + " index = atomic_fetch_add_explicit(numTargetsFound, 1u, " + "memory_order_relaxed, memory_scope_device); \n" + " atomic_exchange_explicit(&targetLocations[index], i, " + "memory_order_relaxed, memory_scope_all_svm_devices); \n" + " }\n" + "}\n" }; diff --git a/test_conformance/device_execution/enqueue_ndrange.cpp b/test_conformance/device_execution/enqueue_ndrange.cpp index 84ac339f..8ced6629 100644 --- a/test_conformance/device_execution/enqueue_ndrange.cpp +++ b/test_conformance/device_execution/enqueue_ndrange.cpp @@ -1,6 +1,6 @@ // // Copyright (c) 2017 The Khronos Group Inc. -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at @@ -27,271 +27,316 @@ #ifdef CL_VERSION_2_0 extern int gWimpyMode; -static const char* helper_ndrange_1d_glo[] = -{ - NL, "void block_fn(int len, __global atomic_uint* val)" - NL, "{" - NL, " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);" - NL, "}" - NL, "" - NL, "kernel void helper_ndrange_1d_glo(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global atomic_uint* val, __global uint* ofs_arr)" - NL, "{" - NL, " size_t tid = get_global_id(0);" - NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" - NL, "" - NL, " for(int i = 0; i < n; i++)" - NL, " {" - NL, " ndrange_t ndrange = ndrange_1D(glob_size_arr[i]);" - NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" - NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" - NL, " }" - NL, "}" - NL +static const char *helper_ndrange_1d_glo[] = { + NL, + "void block_fn(int len, __global atomic_uint* val)" NL, + "{" NL, + " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, " + "memory_order_relaxed, memory_scope_device);" NL, + "}" NL, + "" NL, + "kernel void helper_ndrange_1d_glo(__global int* res, uint n, uint len, " + "__global uint* glob_size_arr, __global uint* loc_size_arr, __global " + "atomic_uint* val, __global uint* ofs_arr)" NL, + "{" NL, + " size_t tid = get_global_id(0);" NL, + " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL, + "" NL, + " for(int i = 0; i < n; i++)" NL, + " {" NL, + " ndrange_t ndrange = ndrange_1D(glob_size_arr[i]);" NL, + " int enq_res = enqueue_kernel(get_default_queue(), " + "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL, + " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL, + " }" NL, + "}" NL }; -static const char* helper_ndrange_1d_loc[] = -{ - NL, "void block_fn(int len, __global atomic_uint* val)" - NL, "{" - NL, " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);" - NL, "}" - NL, "" - NL, "kernel void helper_ndrange_1d_loc(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global atomic_uint* val, __global uint* ofs_arr)" - NL, "{" - NL, " size_t tid = get_global_id(0);" - NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" - NL, "" - NL, " for(int k = 0; k < n; k++)" - NL, " {" - NL, " for(int i = 0; i < n; i++)" - NL, " {" - NL, " if (glob_size_arr[i] >= loc_size_arr[k])" - NL, " {" - NL, " ndrange_t ndrange = ndrange_1D(glob_size_arr[i], loc_size_arr[k]);" - NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" - NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" - NL, " }" - NL, " }" - NL, " }" - NL, "}" - NL +static const char *helper_ndrange_1d_loc[] = { + NL, + "void block_fn(int len, __global atomic_uint* val)" NL, + "{" NL, + " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, " + "memory_order_relaxed, memory_scope_device);" NL, + "}" NL, + "" NL, + "kernel void helper_ndrange_1d_loc(__global int* res, uint n, uint len, " + "__global uint* glob_size_arr, __global uint* loc_size_arr, __global " + "atomic_uint* val, __global uint* ofs_arr)" NL, + "{" NL, + " size_t tid = get_global_id(0);" NL, + " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL, + "" NL, + " for(int k = 0; k < n; k++)" NL, + " {" NL, + " for(int i = 0; i < n; i++)" NL, + " {" NL, + " if (glob_size_arr[i] >= loc_size_arr[k])" NL, + " {" NL, + " ndrange_t ndrange = ndrange_1D(glob_size_arr[i], " + "loc_size_arr[k]);" NL, + " int enq_res = enqueue_kernel(get_default_queue(), " + "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL, + " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL, + " }" NL, + " }" NL, + " }" NL, + "}" NL }; -static const char* helper_ndrange_1d_ofs[] = -{ - NL, "void block_fn(int len, __global atomic_uint* val)" - NL, "{" - NL, " atomic_fetch_add_explicit(&val[(get_global_offset(0) + get_global_linear_id()) % len], 1, memory_order_relaxed, memory_scope_device);" - NL, "}" - NL, "" - NL, "kernel void helper_ndrange_1d_ofs(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global atomic_uint* val, __global uint* ofs_arr)" - NL, "{" - NL, " size_t tid = get_global_id(0);" - NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" - NL, "" - NL, " for(int l = 0; l < n; l++)" - NL, " {" - NL, " for(int k = 0; k < n; k++)" - NL, " {" - NL, " for(int i = 0; i < n; i++)" - NL, " {" - NL, " if (glob_size_arr[i] >= loc_size_arr[k])" - NL, " {" - NL, " ndrange_t ndrange = ndrange_1D(ofs_arr[l], glob_size_arr[i], loc_size_arr[k]);" - NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" - NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" - NL, " }" - NL, " }" - NL, " }" - NL, " }" - NL, "}" - NL +static const char *helper_ndrange_1d_ofs[] = { + NL, + "void block_fn(int len, __global atomic_uint* val)" NL, + "{" NL, + " atomic_fetch_add_explicit(&val[(get_global_offset(0) + " + "get_global_linear_id()) % len], 1u, memory_order_relaxed, " + "memory_scope_device);" NL, + "}" NL, + "" NL, + "kernel void helper_ndrange_1d_ofs(__global int* res, uint n, uint len, " + "__global uint* glob_size_arr, __global uint* loc_size_arr, __global " + "atomic_uint* val, __global uint* ofs_arr)" NL, + "{" NL, + " size_t tid = get_global_id(0);" NL, + " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL, + "" NL, + " for(int l = 0; l < n; l++)" NL, + " {" NL, + " for(int k = 0; k < n; k++)" NL, + " {" NL, + " for(int i = 0; i < n; i++)" NL, + " {" NL, + " if (glob_size_arr[i] >= loc_size_arr[k])" NL, + " {" NL, + " ndrange_t ndrange = ndrange_1D(ofs_arr[l], glob_size_arr[i], " + "loc_size_arr[k]);" NL, + " int enq_res = enqueue_kernel(get_default_queue(), " + "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL, + " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL, + " }" NL, + " }" NL, + " }" NL, + " }" NL, + "}" NL }; -static const char* helper_ndrange_2d_glo[] = -{ - NL, "void block_fn(int len, __global atomic_uint* val)" - NL, "{" - NL, " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);" - NL, "}" - NL, "" - NL, "kernel void helper_ndrange_2d_glo(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val, __global uint* ofs_arr)" - NL, "{" - NL, " size_t tid = get_global_id(0);" - NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" - NL, "" - NL, " for(int i = 0; i < n; i++)" - NL, " {" - NL, " size_t glob_size[2] = { glob_size_arr[i], glob_size_arr[(i + 1) % n] };" - NL, " ndrange_t ndrange = ndrange_2D(glob_size);" - NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" - NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" - NL, " }" - NL, "}" - NL +static const char *helper_ndrange_2d_glo[] = { + NL, + "void block_fn(int len, __global atomic_uint* val)" NL, + "{" NL, + " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, " + "memory_order_relaxed, memory_scope_device);" NL, + "}" NL, + "" NL, + "kernel void helper_ndrange_2d_glo(__global int* res, uint n, uint len, " + "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* " + "val, __global uint* ofs_arr)" NL, + "{" NL, + " size_t tid = get_global_id(0);" NL, + " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL, + "" NL, + " for(int i = 0; i < n; i++)" NL, + " {" NL, + " size_t glob_size[2] = { glob_size_arr[i], glob_size_arr[(i + 1) % n] " + "};" NL, + " ndrange_t ndrange = ndrange_2D(glob_size);" NL, + " int enq_res = enqueue_kernel(get_default_queue(), " + "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL, + " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL, + " }" NL, + "}" NL }; -static const char* helper_ndrange_2d_loc[] = -{ - NL, "void block_fn(int len, __global atomic_uint* val)" - NL, "{" - NL, " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);" - NL, "}" - NL, "" - NL, "kernel void helper_ndrange_2d_loc(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val, __global uint* ofs_arr)" - NL, "{" - NL, " size_t tid = get_global_id(0);" - NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" - NL, "" - NL, " for(int k = 0; k < n; k++)" - NL, " {" - NL, " for(int i = 0; i < n; i++)" - NL, " {" - NL, " if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])" - NL, " {" - NL, " size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % n] };" - NL, " size_t loc_size[] = { 1, loc_size_arr[k] };" - NL, "" - NL, " ndrange_t ndrange = ndrange_2D(glob_size, loc_size);" - NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" - NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" - NL, " }" - NL, " }" - NL, " }" - NL, "}" - NL +static const char *helper_ndrange_2d_loc[] = { + NL, + "void block_fn(int len, __global atomic_uint* val)" NL, + "{" NL, + " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, " + "memory_order_relaxed, memory_scope_device);" NL, + "}" NL, + "" NL, + "kernel void helper_ndrange_2d_loc(__global int* res, uint n, uint len, " + "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* " + "val, __global uint* ofs_arr)" NL, + "{" NL, + " size_t tid = get_global_id(0);" NL, + " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL, + "" NL, + " for(int k = 0; k < n; k++)" NL, + " {" NL, + " for(int i = 0; i < n; i++)" NL, + " {" NL, + " if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])" NL, + " {" NL, + " size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % " + "n] };" NL, + " size_t loc_size[] = { 1, loc_size_arr[k] };" NL, + "" NL, + " ndrange_t ndrange = ndrange_2D(glob_size, loc_size);" NL, + " int enq_res = enqueue_kernel(get_default_queue(), " + "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL, + " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL, + " }" NL, + " }" NL, + " }" NL, + "}" NL }; -static const char* helper_ndrange_2d_ofs[] = -{ - NL, "void block_fn(int len, __global atomic_uint* val)" - NL, "{" - NL, " atomic_fetch_add_explicit(&val[(get_global_offset(1) * get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % len], 1, memory_order_relaxed, memory_scope_device);" - NL, "}" - NL, "" - NL, "kernel void helper_ndrange_2d_ofs(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val, __global uint* ofs_arr)" - NL, "{" - NL, " size_t tid = get_global_id(0);" - NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" - NL, "" - NL, " for(int l = 0; l < n; l++)" - NL, " {" - NL, " for(int k = 0; k < n; k++)" - NL, " {" - NL, " for(int i = 0; i < n; i++)" - NL, " {" - NL, " if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])" - NL, " {" - NL, " size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % n]};" - NL, " size_t loc_size[] = { 1, loc_size_arr[k] };" - NL, " size_t ofs[] = { ofs_arr[l], ofs_arr[(l + 1) % n] };" - NL, "" - NL, " ndrange_t ndrange = ndrange_2D(ofs,glob_size,loc_size);" - NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" - NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" - NL, " }" - NL, " }" - NL, " }" - NL, " }" - NL, "}" - NL +static const char *helper_ndrange_2d_ofs[] = { + NL, + "void block_fn(int len, __global atomic_uint* val)" NL, + "{" NL, + " atomic_fetch_add_explicit(&val[(get_global_offset(1) * " + "get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % " + "len], 1u, memory_order_relaxed, memory_scope_device);" NL, + "}" NL, + "" NL, + "kernel void helper_ndrange_2d_ofs(__global int* res, uint n, uint len, " + "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* " + "val, __global uint* ofs_arr)" NL, + "{" NL, + " size_t tid = get_global_id(0);" NL, + " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL, + "" NL, + " for(int l = 0; l < n; l++)" NL, + " {" NL, + " for(int k = 0; k < n; k++)" NL, + " {" NL, + " for(int i = 0; i < n; i++)" NL, + " {" NL, + " if (glob_size_arr[(i + 1) % n] >= loc_size_arr[k])" NL, + " {" NL, + " size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) " + "% n]};" NL, + " size_t loc_size[] = { 1, loc_size_arr[k] };" NL, + " size_t ofs[] = { ofs_arr[l], ofs_arr[(l + 1) % n] };" NL, + "" NL, + " ndrange_t ndrange = ndrange_2D(ofs,glob_size,loc_size);" NL, + " int enq_res = enqueue_kernel(get_default_queue(), " + "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL, + " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL, + " }" NL, + " }" NL, + " }" NL, + " }" NL, + "}" NL }; -static const char* helper_ndrange_3d_glo[] = -{ - NL, "void block_fn(int len, __global atomic_uint* val)" - NL, "{" - NL, " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);" - NL, "}" - NL, "" - NL, "kernel void helper_ndrange_3d_glo(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val, __global uint* ofs_arr)" - NL, "{" - NL, " size_t tid = get_global_id(0);" - NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" - NL, "" - NL, " for(int i = 0; i < n; i++)" - NL, " {" - NL, " uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) % n] * glob_size_arr[(i + 2) % n];" - NL, " if (global_work_size <= (len * len))" - NL, " {" - NL, " size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) % n], glob_size_arr[(i + 2) % n] };" - NL, " ndrange_t ndrange = ndrange_3D(glob_size);" - NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" - NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" - NL, " }" - NL, " }" - NL, "}" - NL +static const char *helper_ndrange_3d_glo[] = { + NL, + "void block_fn(int len, __global atomic_uint* val)" NL, + "{" NL, + " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, " + "memory_order_relaxed, memory_scope_device);" NL, + "}" NL, + "" NL, + "kernel void helper_ndrange_3d_glo(__global int* res, uint n, uint len, " + "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* " + "val, __global uint* ofs_arr)" NL, + "{" NL, + " size_t tid = get_global_id(0);" NL, + " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL, + "" NL, + " for(int i = 0; i < n; i++)" NL, + " {" NL, + " uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) % " + "n] * glob_size_arr[(i + 2) % n];" NL, + " if (global_work_size <= (len * len))" NL, + " {" NL, + " size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) % " + "n], glob_size_arr[(i + 2) % n] };" NL, + " ndrange_t ndrange = ndrange_3D(glob_size);" NL, + " int enq_res = enqueue_kernel(get_default_queue(), " + "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL, + " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL, + " }" NL, + " }" NL, + "}" NL }; -static const char* helper_ndrange_3d_loc[] = -{ - NL, "void block_fn(int len, __global atomic_uint* val)" - NL, "{" - NL, " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1, memory_order_relaxed, memory_scope_device);" - NL, "}" - NL, "" - NL, "kernel void helper_ndrange_3d_loc(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val, __global uint* ofs_arr)" - NL, "{" - NL, " size_t tid = get_global_id(0);" - NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" - NL, "" - NL, " for(int k = 0; k < n; k++)" - NL, " {" - NL, " for(int i = 0; i < n; i++)" - NL, " {" - NL, " uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) % n] * glob_size_arr[(i + 2) % n];" - NL, " if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && global_work_size <= (len * len))" - NL, " {" - NL, " size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % n], glob_size_arr[(i + 2) % n] };" - NL, " size_t loc_size[] = { 1, 1, loc_size_arr[k] };" - NL, " ndrange_t ndrange = ndrange_3D(glob_size,loc_size);" - NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" - NL, " " - NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" - NL, " }" - NL, " }" - NL, " }" - NL, "}" - NL +static const char *helper_ndrange_3d_loc[] = { + NL, + "void block_fn(int len, __global atomic_uint* val)" NL, + "{" NL, + " atomic_fetch_add_explicit(&val[get_global_linear_id() % len], 1u, " + "memory_order_relaxed, memory_scope_device);" NL, + "}" NL, + "" NL, + "kernel void helper_ndrange_3d_loc(__global int* res, uint n, uint len, " + "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* " + "val, __global uint* ofs_arr)" NL, + "{" NL, + " size_t tid = get_global_id(0);" NL, + " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL, + "" NL, + " for(int k = 0; k < n; k++)" NL, + " {" NL, + " for(int i = 0; i < n; i++)" NL, + " {" NL, + " uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) % " + "n] * glob_size_arr[(i + 2) % n];" NL, + " if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && " + "global_work_size <= (len * len))" NL, + " {" NL, + " size_t glob_size[] = { glob_size_arr[i], glob_size_arr[(i + 1) % " + "n], glob_size_arr[(i + 2) % n] };" NL, + " size_t loc_size[] = { 1, 1, loc_size_arr[k] };" NL, + " ndrange_t ndrange = ndrange_3D(glob_size,loc_size);" NL, + " int enq_res = enqueue_kernel(get_default_queue(), " + "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL, + " " NL, + " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL, + " }" NL, + " }" NL, + " }" NL, + "}" NL }; -static const char* helper_ndrange_3d_ofs[] = -{ - NL, "void block_fn(int len, __global atomic_uint* val)" - NL, "{" - NL, " atomic_fetch_add_explicit(&val[(get_global_offset(2) * get_global_size(0) * get_global_size(1) + get_global_offset(1) * get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % len], 1, memory_order_relaxed, memory_scope_device);" - NL, "}" - NL, "" - NL, "kernel void helper_ndrange_3d_ofs(__global int* res, uint n, uint len, __global uint* glob_size_arr, __global uint* loc_size_arr, __global int* val, __global uint* ofs_arr)" - NL, "{" - NL, " size_t tid = get_global_id(0);" - NL, " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" - NL, "" - NL, " for(int l = 0; l < n; l++)" - NL, " {" - NL, " for(int k = 0; k < n; k++)" - NL, " {" - NL, " for(int i = 0; i < n; i++)" - NL, " {" - NL, " uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) % n] * glob_size_arr[(i + 2) % n];" - NL, " if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && global_work_size <= (len * len))" - NL, " {" - NL, " size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) % n], glob_size_arr[(i + 2) % n]};" - NL, " size_t loc_size[3] = { 1, 1, loc_size_arr[k] };" - NL, " size_t ofs[3] = { ofs_arr[l], ofs_arr[(l + 1) % n], ofs_arr[(l + 2) % n] };" - NL, " ndrange_t ndrange = ndrange_3D(ofs,glob_size,loc_size);" - NL, " int enq_res = enqueue_kernel(get_default_queue(), CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" - NL, " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" - NL, " }" - NL, " }" - NL, " }" - NL, " }" - NL, "}" - NL +static const char *helper_ndrange_3d_ofs[] = { + NL, + "void block_fn(int len, __global atomic_uint* val)" NL, + "{" NL, + " atomic_fetch_add_explicit(&val[(get_global_offset(2) * " + "get_global_size(0) * get_global_size(1) + get_global_offset(1) * " + "get_global_size(0) + get_global_offset(0) + get_global_linear_id()) % " + "len], 1u, memory_order_relaxed, memory_scope_device);" NL, + "}" NL, + "" NL, + "kernel void helper_ndrange_3d_ofs(__global int* res, uint n, uint len, " + "__global uint* glob_size_arr, __global uint* loc_size_arr, __global int* " + "val, __global uint* ofs_arr)" NL, + "{" NL, + " size_t tid = get_global_id(0);" NL, + " void (^kernelBlock)(void) = ^{ block_fn(len, val); };" NL, + "" NL, + " for(int l = 0; l < n; l++)" NL, + " {" NL, + " for(int k = 0; k < n; k++)" NL, + " {" NL, + " for(int i = 0; i < n; i++)" NL, + " {" NL, + " uint global_work_size = glob_size_arr[i] * glob_size_arr[(i + 1) " + "% n] * glob_size_arr[(i + 2) % n];" NL, + " if (glob_size_arr[(i + 2) % n] >= loc_size_arr[k] && " + "global_work_size <= (len * len))" NL, + " {" NL, + " size_t glob_size[3] = { glob_size_arr[i], glob_size_arr[(i + 1) " + "% n], glob_size_arr[(i + 2) % n]};" NL, + " size_t loc_size[3] = { 1, 1, loc_size_arr[k] };" NL, + " size_t ofs[3] = { ofs_arr[l], ofs_arr[(l + 1) % n], ofs_arr[(l " + "+ 2) % n] };" NL, + " ndrange_t ndrange = ndrange_3D(ofs,glob_size,loc_size);" NL, + " int enq_res = enqueue_kernel(get_default_queue(), " + "CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, kernelBlock);" NL, + " if(enq_res != CLK_SUCCESS) { res[tid] = -1; return; }" NL, + " }" NL, + " }" NL, + " }" NL, + " }" NL, + "}" NL }; static const kernel_src_dim_check sources_ndrange_Xd[] =