diff --git a/test_conformance/workgroups/CMakeLists.txt b/test_conformance/workgroups/CMakeLists.txt index c90bef88..0c004b32 100644 --- a/test_conformance/workgroups/CMakeLists.txt +++ b/test_conformance/workgroups/CMakeLists.txt @@ -5,15 +5,7 @@ set(${MODULE_NAME}_SOURCES test_wg_all.cpp test_wg_any.cpp test_wg_broadcast.cpp - test_wg_reduce.cpp - test_wg_reduce_max.cpp - test_wg_reduce_min.cpp - test_wg_scan_exclusive_add.cpp - test_wg_scan_exclusive_min.cpp - test_wg_scan_exclusive_max.cpp - test_wg_scan_inclusive_add.cpp - test_wg_scan_inclusive_min.cpp - test_wg_scan_inclusive_max.cpp + test_wg_scan_reduce.cpp test_wg_suggested_local_work_size.cpp ) diff --git a/test_conformance/workgroups/test_wg_reduce.cpp b/test_conformance/workgroups/test_wg_reduce.cpp deleted file mode 100644 index eb26f498..00000000 --- a/test_conformance/workgroups/test_wg_reduce.cpp +++ /dev/null @@ -1,596 +0,0 @@ -// -// Copyright (c) 2017 The Khronos Group Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -#include "harness/compat.h" - -#include -#include -#include -#include - -#include "procs.h" - - -const char *wg_reduce_add_kernel_code_int = -"__kernel void test_wg_reduce_add_int(global int *input, global int *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" int result = work_group_reduce_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_reduce_add_kernel_code_uint = -"__kernel void test_wg_reduce_add_uint(global uint *input, global uint *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" uint result = work_group_reduce_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - -const char *wg_reduce_add_kernel_code_long = -"__kernel void test_wg_reduce_add_long(global long *input, global long *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" long result = work_group_reduce_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_reduce_add_kernel_code_ulong = -"__kernel void test_wg_reduce_add_ulong(global ulong *input, global ulong *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" ulong result = work_group_reduce_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -static int -verify_wg_reduce_add_int(int *inptr, int *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i wg_size ? wg_size : (n-i)); j++) - sum += inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( sum != outptr[i+j] ) - { - log_info("work_group_reduce_add int: Error at %u: expected = %d, got = %d\n", i+j, sum, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_reduce_add_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i wg_size ? wg_size : (n-i)); j++) - sum += inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( sum != outptr[i+j] ) - { - log_info("work_group_reduce_add uint: Error at %u: expected = %d, got = %d\n", i+j, sum, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_reduce_add_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i wg_size ? wg_size : (n-i)); j++) - sum += inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( sum != outptr[i+j] ) - { - log_info("work_group_reduce_add long: Error at %u: expected = %lld, got = %lld\n", i+j, sum, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_reduce_add_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i wg_size ? wg_size : (n-i)); j++) - sum += inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( sum != outptr[i+j] ) - { - log_info("work_group_reduce_add ulong: Error at %u: expected = %llu, got = %llu\n", i+j, sum, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - - - -int -test_work_group_reduce_add_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_int *input_ptr[1], *p; - cl_int *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_reduce_add_kernel_code_int, - "test_wg_reduce_add_int"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. - err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements); - output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i -#include -#include -#include - -#include "procs.h" - - -const char *wg_reduce_max_kernel_code_int = -"__kernel void test_wg_reduce_max_int(global int *input, global int *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" int result = work_group_reduce_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_reduce_max_kernel_code_uint = -"__kernel void test_wg_reduce_max_uint(global uint *input, global uint *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" uint result = work_group_reduce_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - -const char *wg_reduce_max_kernel_code_long = -"__kernel void test_wg_reduce_max_long(global long *input, global long *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" long result = work_group_reduce_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_reduce_max_kernel_code_ulong = -"__kernel void test_wg_reduce_max_ulong(global ulong *input, global ulong *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" ulong result = work_group_reduce_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -static int -verify_wg_reduce_max_int(int *inptr, int *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i wg_size ? wg_size : (n-i)); j++) - max = (max > inptr[i+j]) ? max : inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( max != outptr[i+j] ) - { - log_info("work_group_reduce_max int: Error at %u: expected = %d, got = %d\n", i+j, max, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_reduce_max_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i wg_size ? wg_size : (n-i)); j++) - max = (max > inptr[i+j]) ? max : inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( max != outptr[i+j] ) - { - log_info("work_group_reduce_max uint: Error at %u: expected = %d, got = %d\n", i+j, max, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_reduce_max_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i wg_size ? wg_size : (n-i)); j++) - max = (max > inptr[i+j]) ? max : inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( max != outptr[i+j] ) - { - log_info("work_group_reduce_max long: Error at %u: expected = %lld, got = %lld\n", i+j, max, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_reduce_max_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i wg_size ? wg_size : (n-i)); j++) - max = (max > inptr[i+j]) ? max : inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( max != outptr[i+j] ) - { - log_info("work_group_reduce_max ulong: Error at %u: expected = %llu, got = %llu\n", i+j, max, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - - - -int -test_work_group_reduce_max_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_int *input_ptr[1], *p; - cl_int *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_reduce_max_kernel_code_int, - "test_wg_reduce_max_int"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements); - output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i -#include -#include -#include - -#include "procs.h" - - -const char *wg_reduce_min_kernel_code_int = -"__kernel void test_wg_reduce_min_int(global int *input, global int *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" int result = work_group_reduce_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_reduce_min_kernel_code_uint = -"__kernel void test_wg_reduce_min_uint(global uint *input, global uint *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" uint result = work_group_reduce_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - -const char *wg_reduce_min_kernel_code_long = -"__kernel void test_wg_reduce_min_long(global long *input, global long *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" long result = work_group_reduce_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_reduce_min_kernel_code_ulong = -"__kernel void test_wg_reduce_min_ulong(global ulong *input, global ulong *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" ulong result = work_group_reduce_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -static int -verify_wg_reduce_min_int(int *inptr, int *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i wg_size ? wg_size : (n-i)); j++) - min = (min < inptr[i+j]) ? min : inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( min != outptr[i+j] ) - { - log_info("work_group_reduce_min int: Error at %u: expected = %d, got = %d\n", i+j, min, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_reduce_min_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i wg_size ? wg_size : (n-i)); j++) - min = (min < inptr[i+j]) ? min : inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( min != outptr[i+j] ) - { - log_info("work_group_reduce_min uint: Error at %u: expected = %d, got = %d\n", i+j, min, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_reduce_min_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i wg_size ? wg_size : (n-i)); j++) - min = (min < inptr[i+j]) ? min : inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( min != outptr[i+j] ) - { - log_info("work_group_reduce_min long: Error at %u: expected = %lld, got = %lld\n", i+j, min, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_reduce_min_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) -{ - size_t i, j; - - for (i=0; i wg_size ? wg_size : (n-i)); j++) - min = (min < inptr[i+j]) ? min : inptr[i+j]; - - for (j=0; j<((n-i) > wg_size ? wg_size : (n-i)); j++) - { - if ( min != outptr[i+j] ) - { - log_info("work_group_reduce_min ulong: Error at %u: expected = %llu, got = %llu\n", i+j, min, outptr[i+j]); - return -1; - } - } - } - - return 0; -} - - - -int -test_work_group_reduce_min_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_int *input_ptr[1], *p; - cl_int *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_reduce_min_kernel_code_int, - "test_wg_reduce_min_int"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements); - output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i -#include -#include -#include - -#include "procs.h" - - -const char *wg_scan_exclusive_add_kernel_code_int = -"__kernel void test_wg_scan_exclusive_add_int(global int *input, global int *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" int result = work_group_scan_exclusive_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_exclusive_add_kernel_code_uint = -"__kernel void test_wg_scan_exclusive_add_uint(global uint *input, global uint *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" uint result = work_group_scan_exclusive_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - -const char *wg_scan_exclusive_add_kernel_code_long = -"__kernel void test_wg_scan_exclusive_add_long(global long *input, global long *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" long result = work_group_scan_exclusive_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_exclusive_add_kernel_code_ulong = -"__kernel void test_wg_scan_exclusive_add_ulong(global ulong *input, global ulong *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" ulong result = work_group_scan_exclusive_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -static int -verify_wg_scan_exclusive_add_int(int *inptr, int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - int s, lasts; - - - - for (j = 0; j < n; j += wg_size) { - m = n - j; - if (m > wg_size) m = wg_size; - - s = 0; - lasts = 0; - for (i = 0; i < m; ++i) { - s += inptr[j + i]; - if (outptr[j + i] != lasts) { - log_info("work_group_scan_exclusive_add int: Error at %u: expected = %d, got = %d\n", - (unsigned int)(j + i), lasts, outptr[j + i]); - return -1; - } - lasts = s; - } - } - return 0; -} - -static int -verify_wg_scan_exclusive_add_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - unsigned int s, lasts; - - for (j = 0; j < n; j += wg_size) { - m = n - j; - if (m > wg_size) m = wg_size; - s = 0; - lasts = 0; - for (i = 0; i < m; ++i) { - s += inptr[j + i]; - if (outptr[j + i] != lasts) { - log_info("work_group_scan_exclusive_add uint: Error at %u: expected = %u, got = %u\n", - (unsigned int)(j + i), lasts, outptr[j + i]); - return -1; - } - lasts = s; - } - } - - return 0; -} - -static int -verify_wg_scan_exclusive_add_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - cl_long s, lasts; - - for (j = 0; j < n; j += wg_size) { - m = n - j; - if (m > wg_size) m = wg_size; - s = 0; - - lasts = 0; - for (i = 0; i < m; ++i) { - s += inptr[j + i]; - - if (outptr[j + i] != lasts) { - log_info("work_group_scan_exclusive_add long: Error at %u: expected = %lld, got = %lld\n", - (unsigned int)(j + i), (long long)lasts, (long long)outptr[j + i]); - return -1; - } - lasts = s; - } - } - - return 0; -} - -static int -verify_wg_scan_exclusive_add_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - cl_ulong s, lasts; - - for (j = 0; j < n; j += wg_size) { - m = n - j; - if (m > wg_size) m = wg_size; - - s = 0; - lasts = 0; - for (i = 0; i < m; ++i) { - s += inptr[j + i]; - if (outptr[j + i] != lasts) { - log_info("work_group_scan_exclusive_add ulong: Error at %u: expected = %llu, got = %llu\n", - (unsigned int)(j + i), (unsigned long long)lasts, (unsigned long long)outptr[j + i]); - return -1; - } - lasts = s; - } - } - return 0; -} - - -int -test_work_group_scan_exclusive_add_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_int *input_ptr[1], *p; - cl_int *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_exclusive_add_kernel_code_int, - "test_wg_scan_exclusive_add_int"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. - err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements); - output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i -#include -#include -#include - -#include - -#include "procs.h" - -const char *wg_scan_exclusive_max_kernel_code_int = -"__kernel void test_wg_scan_exclusive_max_int(global int *input, global int *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" int result = work_group_scan_exclusive_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_exclusive_max_kernel_code_uint = -"__kernel void test_wg_scan_exclusive_max_uint(global uint *input, global uint *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" uint result = work_group_scan_exclusive_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - -const char *wg_scan_exclusive_max_kernel_code_long = -"__kernel void test_wg_scan_exclusive_max_long(global long *input, global long *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" long result = work_group_scan_exclusive_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_exclusive_max_kernel_code_ulong = -"__kernel void test_wg_scan_exclusive_max_ulong(global ulong *input, global ulong *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" ulong result = work_group_scan_exclusive_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -static int -verify_wg_scan_exclusive_max_int(int *inptr, int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - if (outptr[j+i] != max_) { - log_info("work_group_scan_exclusive_max int: Error at %u: expected = %d, got = %d\n", (unsigned int)(j+i), max_, outptr[j+i]); - return -1; - } - max_ = std::max(inptr[j + i], max_); - } - } - - return 0; -} - -static int -verify_wg_scan_exclusive_max_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - if (outptr[j+i] != max_) { - log_info("work_group_scan_exclusive_max int: Error at %u: expected = %u, got = %u\n", (unsigned int)(j+i), max_, outptr[j+i]); - return -1; - } - max_ = std::max(inptr[j + i], max_); - } - } - - return 0; -} - -static int -verify_wg_scan_exclusive_max_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - if (outptr[j+i] != max_) { - log_info("work_group_scan_exclusive_max long: Error at %u: expected = %lld, got = %lld\n", (unsigned int)(j+i), max_, outptr[j+i]); - return -1; - } - max_ = std::max(inptr[j + i], max_); - } - } - - return 0; -} - -static int -verify_wg_scan_exclusive_max_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - if (outptr[j+i] != max_) { - log_info("work_group_scan_exclusive_max ulong: Error at %u: expected = %llu, got = %llu\n", (unsigned int)(j+i), max_, outptr[j+i]); - return -1; - } - max_ = std::max(inptr[j + i], max_); - } - } - - return 0; -} - - -int -test_work_group_scan_exclusive_max_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_int *input_ptr[1], *p; - cl_int *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_exclusive_max_kernel_code_int, - "test_wg_scan_exclusive_max_int"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements); - output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i -#include -#include -#include - -#include - -#include "procs.h" - -const char *wg_scan_exclusive_min_kernel_code_int = -"__kernel void test_wg_scan_exclusive_min_int(global int *input, global int *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" int result = work_group_scan_exclusive_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_exclusive_min_kernel_code_uint = -"__kernel void test_wg_scan_exclusive_min_uint(global uint *input, global uint *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" uint result = work_group_scan_exclusive_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - -const char *wg_scan_exclusive_min_kernel_code_long = -"__kernel void test_wg_scan_exclusive_min_long(global long *input, global long *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" long result = work_group_scan_exclusive_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_exclusive_min_kernel_code_ulong = -"__kernel void test_wg_scan_exclusive_min_ulong(global ulong *input, global ulong *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" ulong result = work_group_scan_exclusive_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - - -static int -verify_wg_scan_exclusive_min_int(int *inptr, int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - if (outptr[j+i] != min_) { - log_info("work_group_scan_exclusive_min int: Error at %u: expected = %d, got = %d\n", (unsigned int)(j+i), min_, outptr[j+i]); - return -1; - } - min_ = std::min(inptr[j + i], min_); - } - } - - return 0; -} - -static int -verify_wg_scan_exclusive_min_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - if (outptr[j+i] != min_) { - log_info("work_group_scan_exclusive_min int: Error at %u: expected = %u, got = %u\n", j+i, min_, outptr[j+i]); - return -1; - } - min_ = std::min(inptr[j + i], min_); - } - } - - return 0; -} - -static int -verify_wg_scan_exclusive_min_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - if (outptr[j+i] != min_) { - log_info("work_group_scan_exclusive_min long: Error at %u: expected = %lld, got = %lld\n", (unsigned int)(j+i), min_, outptr[j+i]); - return -1; - } - min_ = std::min(inptr[j + i], min_); - } - } - - return 0; -} - -static int -verify_wg_scan_exclusive_min_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - if (outptr[j+i] != min_) { - log_info("work_group_scan_exclusive_min ulong: Error at %u: expected = %llu, got = %llu\n", (unsigned int)(j+i), min_, outptr[j+i]); - return -1; - } - min_ = std::min(inptr[j + i], min_); - } - } - - return 0; -} - - -int -test_work_group_scan_exclusive_min_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_int *input_ptr[1], *p; - cl_int *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t wg_sizes_per_dimension[3]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_exclusive_min_kernel_code_int, - "test_wg_scan_exclusive_min_int"); - if (err) - return -1; - - err = clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), wg_size, NULL); - if (err) - return -1; - - err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, wg_sizes_per_dimension, NULL); - if (err) - return -1; - if(wg_sizes_per_dimension[0] < wg_size[0]) - { - wg_size[0] = wg_sizes_per_dimension[0]; - } - - num_elements = n_elems; - - input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements); - output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i -#include -#include -#include - -#include "procs.h" - - -const char *wg_scan_inclusive_add_kernel_code_int = -"__kernel void test_wg_scan_inclusive_add_int(global int *input, global int *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" int result = work_group_scan_inclusive_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_inclusive_add_kernel_code_uint = -"__kernel void test_wg_scan_inclusive_add_uint(global uint *input, global uint *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" uint result = work_group_scan_inclusive_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - -const char *wg_scan_inclusive_add_kernel_code_long = -"__kernel void test_wg_scan_inclusive_add_long(global long *input, global long *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" long result = work_group_scan_inclusive_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_inclusive_add_kernel_code_ulong = -"__kernel void test_wg_scan_inclusive_add_ulong(global ulong *input, global ulong *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" ulong result = work_group_scan_inclusive_add(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -static int -verify_wg_scan_inclusive_add_int(int *inptr, int *outptr, size_t n, size_t wg_size) -{ - size_t i, j, m; - int s; - - for (j=0; j wg_size) - m = wg_size; - - s = 0; - for (i=0; i wg_size) - m = wg_size; - - s = 0; - for (i=0; i wg_size) - m = wg_size; - - s = 0; - for (i=0; i wg_size) - m = wg_size; - - s = 0; - for (i=0; i -#include -#include -#include - -#include - -#include "procs.h" - - -const char *wg_scan_inclusive_max_kernel_code_int = -"__kernel void test_wg_scan_inclusive_max_int(global int *input, global int *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" int result = work_group_scan_inclusive_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_inclusive_max_kernel_code_uint = -"__kernel void test_wg_scan_inclusive_max_uint(global uint *input, global uint *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" uint result = work_group_scan_inclusive_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - -const char *wg_scan_inclusive_max_kernel_code_long = -"__kernel void test_wg_scan_inclusive_max_long(global long *input, global long *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" long result = work_group_scan_inclusive_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_inclusive_max_kernel_code_ulong = -"__kernel void test_wg_scan_inclusive_max_ulong(global ulong *input, global ulong *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" ulong result = work_group_scan_inclusive_max(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -static int -verify_wg_scan_inclusive_max_int(int *inptr, int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - max_ = std::max(inptr[j + i], max_); - if (outptr[j+i] != max_) { - log_info("work_group_scan_inclusive_max int: Error at %u: expected = %d, got = %d\n", (unsigned int)(j+i), max_, outptr[j+i]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_scan_inclusive_max_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - max_ = std::max(inptr[j + i], max_); - if (outptr[j+i] != max_) { - log_info("work_group_scan_inclusive_max int: Error at %lu: expected = %u, got = %u\n", (unsigned long)(j+i), max_, outptr[j+i]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_scan_inclusive_max_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - max_ = std::max(inptr[j + i], max_); - if (outptr[j+i] != max_) { - log_info("work_group_scan_inclusive_max long: Error at %u: expected = %lld, got = %lld\n", (unsigned int)(j+i), max_, outptr[j+i]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_scan_inclusive_max_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - max_ = std::max(inptr[j + i], max_); - if (outptr[j+i] != max_) { - log_info("work_group_scan_inclusive_max ulong: Error at %u: expected = %llu, got = %llu\n", (unsigned int)(j+i), max_, outptr[j+i]); - return -1; - } - } - } - - return 0; -} - - -int -test_work_group_scan_inclusive_max_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_int *input_ptr[1], *p; - cl_int *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_inclusive_max_kernel_code_int, - "test_wg_scan_inclusive_max_int"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. - err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements); - output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i -#include -#include -#include - -#include - -#include "procs.h" - - -const char *wg_scan_inclusive_min_kernel_code_int = -"__kernel void test_wg_scan_inclusive_min_int(global int *input, global int *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" int result = work_group_scan_inclusive_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_inclusive_min_kernel_code_uint = -"__kernel void test_wg_scan_inclusive_min_uint(global uint *input, global uint *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" uint result = work_group_scan_inclusive_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - -const char *wg_scan_inclusive_min_kernel_code_long = -"__kernel void test_wg_scan_inclusive_min_long(global long *input, global long *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" long result = work_group_scan_inclusive_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -const char *wg_scan_inclusive_min_kernel_code_ulong = -"__kernel void test_wg_scan_inclusive_min_ulong(global ulong *input, global ulong *output)\n" -"{\n" -" int tid = get_global_id(0);\n" -"\n" -" ulong result = work_group_scan_inclusive_min(input[tid]);\n" -" output[tid] = result;\n" -"}\n"; - - -static int -verify_wg_scan_inclusive_min_int(int *inptr, int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - min_ = std::min(inptr[j + i], min_); - if (outptr[j+i] != min_) { - log_info("work_group_scan_inclusive_min int: Error at %u: expected = %d, got = %d\n", (unsigned int)(j+i), min_, outptr[j+i]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_scan_inclusive_min_uint(unsigned int *inptr, unsigned int *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - min_ = std::min(inptr[j + i], min_); - if (outptr[j+i] != min_) { - log_info("work_group_scan_inclusive_min int: Error at %u: expected = %u, got = %u\n", (unsigned int)(j+i), min_, outptr[j+i]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_scan_inclusive_min_long(cl_long *inptr, cl_long *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - min_ = std::min(inptr[j + i], min_); - if (outptr[j+i] != min_) { - log_info("work_group_scan_inclusive_min long: Error at %u: expected = %lld, got = %lld\n", (unsigned int)(j+i), min_, outptr[j+i]); - return -1; - } - } - } - - return 0; -} - -static int -verify_wg_scan_inclusive_min_ulong(cl_ulong *inptr, cl_ulong *outptr, size_t n, size_t wg_size) { - - size_t i, j, m; - - for (j=0; j wg_size) - m = wg_size; - - for (i = 0; i < m; ++i) { - min_ = std::min(inptr[j + i], min_); - if (outptr[j+i] != min_) { - log_info("work_group_scan_inclusive_min ulong: Error at %u: expected = %llu, got = %llu\n", (unsigned int)(j+i), min_, outptr[j+i]); - return -1; - } - } - } - - return 0; -} - - -int -test_work_group_scan_inclusive_min_int(cl_device_id device, cl_context context, cl_command_queue queue, int n_elems) -{ - cl_mem streams[2]; - cl_int *input_ptr[1], *p; - cl_int *output_ptr; - cl_program program; - cl_kernel kernel; - void *values[2]; - size_t threads[1]; - size_t wg_size[1]; - size_t num_elements; - int err; - int i; - MTdata d; - - err = create_single_kernel_helper(context, &program, &kernel, 1, - &wg_scan_inclusive_min_kernel_code_int, - "test_wg_scan_inclusive_min_int"); - if (err) - return -1; - - // "wg_size" is limited to that of the first dimension as only a 1DRange is executed. - err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); - test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); - - num_elements = n_elems; - - input_ptr[0] = (cl_int*)malloc(sizeof(cl_int) * num_elements); - output_ptr = (cl_int*)malloc(sizeof(cl_int) * num_elements); - streams[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[0]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - streams[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, - sizeof(cl_int) * num_elements, NULL, NULL); - if (!streams[1]) - { - log_error("clCreateBuffer failed\n"); - return -1; - } - - p = input_ptr[0]; - d = init_genrand( gRandomSeed ); - for (i=0; i +#include +#include + +#include "procs.h" + +static std::string make_kernel_string(const std::string &type, + const std::string &kernelName, + const std::string &func) +{ + // Build a kernel string of the form: + // __kernel void KERNEL_NAME(global TYPE *input, global TYPE *output) { + // int tid = get_global_id(0); + // output[tid] = FUNC(input[tid]); + // } + + std::ostringstream os; + os << "__kernel void " << kernelName << "(global " << type + << " *input, global " << type << " *output) {\n"; + os << " int tid = get_global_id(0);\n"; + os << " output[tid] = " << func << "(input[tid]);\n"; + os << "}\n"; + return os.str(); +} + +template struct TestTypeInfo +{ +}; + +template <> struct TestTypeInfo +{ + static constexpr const char *deviceName = "int"; +}; + +template <> struct TestTypeInfo +{ + static constexpr const char *deviceName = "uint"; +}; + +template <> struct TestTypeInfo +{ + static constexpr const char *deviceName = "long"; +}; + +template <> struct TestTypeInfo +{ + static constexpr const char *deviceName = "ulong"; +}; + +template struct Add +{ + using Type = T; + static constexpr const char *opName = "add"; + static constexpr T identityValue = 0; + static T combine(T a, T b) { return a + b; } +}; + +template struct Max +{ + using Type = T; + static constexpr const char *opName = "max"; + static constexpr T identityValue = std::numeric_limits::min(); + static T combine(T a, T b) { return std::max(a, b); } +}; + +template struct Min +{ + using Type = T; + static constexpr const char *opName = "min"; + static constexpr T identityValue = std::numeric_limits::max(); + static T combine(T a, T b) { return std::min(a, b); } +}; + +template struct Reduce +{ + using Type = typename C::Type; + + static constexpr const char *testName = "work_group_reduce"; + static constexpr const char *testOpName = C::opName; + static constexpr const char *deviceTypeName = + TestTypeInfo::deviceName; + static constexpr const char *kernelName = "test_wg_reduce"; + static int verify(Type *inptr, Type *outptr, size_t n_elems, + size_t max_wg_size) + { + for (size_t i = 0; i < n_elems; i += max_wg_size) + { + size_t wg_size = std::min(max_wg_size, n_elems - i); + + Type result = C::identityValue; + for (size_t j = 0; j < wg_size; j++) + { + result = C::combine(result, inptr[i + j]); + } + + for (size_t j = 0; j < wg_size; j++) + { + if (result != outptr[i + j]) + { + log_info("%s_%s: Error at %zu\n", testName, testOpName, + i + j); + return -1; + } + } + } + return 0; + } +}; + +template struct ScanInclusive +{ + using Type = typename C::Type; + + static constexpr const char *testName = "work_group_scan_inclusive"; + static constexpr const char *testOpName = C::opName; + static constexpr const char *deviceTypeName = + TestTypeInfo::deviceName; + static constexpr const char *kernelName = "test_wg_scan_inclusive"; + static int verify(Type *inptr, Type *outptr, size_t n_elems, + size_t max_wg_size) + { + for (size_t i = 0; i < n_elems; i += max_wg_size) + { + size_t wg_size = std::min(max_wg_size, n_elems - i); + + Type result = C::identityValue; + for (size_t j = 0; j < wg_size; ++j) + { + result = C::combine(result, inptr[i + j]); + if (result != outptr[i + j]) + { + log_info("%s_%s: Error at %zu\n", testName, testOpName, + i + j); + return -1; + } + } + } + return 0; + } +}; + +template struct ScanExclusive +{ + using Type = typename C::Type; + + static constexpr const char *testName = "work_group_scan_exclusive"; + static constexpr const char *testOpName = C::opName; + static constexpr const char *deviceTypeName = + TestTypeInfo::deviceName; + static constexpr const char *kernelName = "test_wg_scan_exclusive"; + static int verify(Type *inptr, Type *outptr, size_t n_elems, + size_t max_wg_size) + { + for (size_t i = 0; i < n_elems; i += max_wg_size) + { + size_t wg_size = std::min(max_wg_size, n_elems - i); + + Type result = C::identityValue; + for (size_t j = 0; j < wg_size; ++j) + { + if (result != outptr[i + j]) + { + log_info("%s_%s: Error at %zu\n", testName, testOpName, + i + j); + return -1; + } + result = C::combine(result, inptr[i + j]); + } + } + return 0; + } +}; + +template +static int run_test(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + using T = typename TestInfo::Type; + + cl_int err = CL_SUCCESS; + + clProgramWrapper program; + clKernelWrapper kernel; + + std::string funcName = TestInfo::testName; + funcName += "_"; + funcName += TestInfo::testOpName; + + std::string kernelName = TestInfo::kernelName; + kernelName += "_"; + kernelName += TestInfo::testOpName; + kernelName += "_"; + kernelName += TestInfo::deviceTypeName; + + std::string kernelString = + make_kernel_string(TestInfo::deviceTypeName, kernelName, funcName); + + const char *kernel_source = kernelString.c_str(); + err = create_single_kernel_helper(context, &program, &kernel, 1, + &kernel_source, kernelName.c_str()); + test_error(err, "Unable to create test kernel"); + + size_t wg_size[1]; + err = get_max_allowed_1d_work_group_size_on_device(device, kernel, wg_size); + test_error(err, "get_max_allowed_1d_work_group_size_on_device failed"); + + clMemWrapper src = clCreateBuffer(context, CL_MEM_READ_WRITE, + sizeof(T) * n_elems, NULL, &err); + test_error(err, "Unable to create source buffer"); + + clMemWrapper dst = clCreateBuffer(context, CL_MEM_READ_WRITE, + sizeof(T) * n_elems, NULL, &err); + test_error(err, "Unable to create destination buffer"); + + std::vector input_ptr(n_elems); + + MTdataHolder d(gRandomSeed); + for (int i = 0; i < n_elems; i++) + { + input_ptr[i] = (T)genrand_int64(d); + } + + err = clEnqueueWriteBuffer(queue, src, CL_TRUE, 0, sizeof(T) * n_elems, + input_ptr.data(), 0, NULL, NULL); + test_error(err, "clWriteBuffer to initialize src buffer failed"); + + err = clSetKernelArg(kernel, 0, sizeof(src), &src); + test_error(err, "Unable to set src buffer kernel arg"); + err |= clSetKernelArg(kernel, 1, sizeof(dst), &dst); + test_error(err, "Unable to set dst buffer kernel arg"); + + size_t global_work_size[] = { (size_t)n_elems }; + err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, + wg_size, 0, NULL, NULL); + test_error(err, "Unable to enqueue test kernel"); + + std::vector output_ptr(n_elems); + + cl_uint dead = 0xdeaddead; + memset_pattern4(output_ptr.data(), &dead, sizeof(T) * n_elems); + err = clEnqueueReadBuffer(queue, dst, CL_TRUE, 0, sizeof(T) * n_elems, + output_ptr.data(), 0, NULL, NULL); + test_error(err, "clEnqueueReadBuffer to read read dst buffer failed"); + + if (TestInfo::verify(input_ptr.data(), output_ptr.data(), n_elems, + wg_size[0])) + { + log_error("%s_%s %s failed\n", TestInfo::testName, TestInfo::testOpName, + TestInfo::deviceTypeName); + return TEST_FAIL; + } + + log_info("%s_%s %s passed\n", TestInfo::testName, TestInfo::testOpName, + TestInfo::deviceTypeName); + return TEST_PASS; +} + +int test_work_group_reduce_add(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + int result = TEST_PASS; + + result |= run_test>>(device, context, queue, n_elems); + result |= run_test>>(device, context, queue, n_elems); + + if (gHasLong) + { + result |= + run_test>>(device, context, queue, n_elems); + result |= + run_test>>(device, context, queue, n_elems); + } + + return result; +} + +int test_work_group_reduce_max(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + int result = TEST_PASS; + + result |= run_test>>(device, context, queue, n_elems); + result |= run_test>>(device, context, queue, n_elems); + + if (gHasLong) + { + result |= + run_test>>(device, context, queue, n_elems); + result |= + run_test>>(device, context, queue, n_elems); + } + + return result; +} + +int test_work_group_reduce_min(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + int result = TEST_PASS; + + result |= run_test>>(device, context, queue, n_elems); + result |= run_test>>(device, context, queue, n_elems); + + if (gHasLong) + { + result |= + run_test>>(device, context, queue, n_elems); + result |= + run_test>>(device, context, queue, n_elems); + } + + return result; +} + +int test_work_group_scan_inclusive_add(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + int result = TEST_PASS; + + result |= + run_test>>(device, context, queue, n_elems); + result |= + run_test>>(device, context, queue, n_elems); + + if (gHasLong) + { + result |= run_test>>(device, context, queue, + n_elems); + result |= run_test>>(device, context, queue, + n_elems); + } + + return result; +} + +int test_work_group_scan_inclusive_max(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + int result = TEST_PASS; + + result |= + run_test>>(device, context, queue, n_elems); + result |= + run_test>>(device, context, queue, n_elems); + + if (gHasLong) + { + result |= run_test>>(device, context, queue, + n_elems); + result |= run_test>>(device, context, queue, + n_elems); + } + + return result; +} + +int test_work_group_scan_inclusive_min(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + int result = TEST_PASS; + + result |= + run_test>>(device, context, queue, n_elems); + result |= + run_test>>(device, context, queue, n_elems); + + if (gHasLong) + { + result |= run_test>>(device, context, queue, + n_elems); + result |= run_test>>(device, context, queue, + n_elems); + } + + return result; +} + +int test_work_group_scan_exclusive_add(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + int result = TEST_PASS; + + result |= + run_test>>(device, context, queue, n_elems); + result |= + run_test>>(device, context, queue, n_elems); + + if (gHasLong) + { + result |= run_test>>(device, context, queue, + n_elems); + result |= run_test>>(device, context, queue, + n_elems); + } + + return result; +} + +int test_work_group_scan_exclusive_max(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + int result = TEST_PASS; + + result |= + run_test>>(device, context, queue, n_elems); + result |= + run_test>>(device, context, queue, n_elems); + + if (gHasLong) + { + result |= run_test>>(device, context, queue, + n_elems); + result |= run_test>>(device, context, queue, + n_elems); + } + + return result; +} + +int test_work_group_scan_exclusive_min(cl_device_id device, cl_context context, + cl_command_queue queue, int n_elems) +{ + int result = TEST_PASS; + + result |= + run_test>>(device, context, queue, n_elems); + result |= + run_test>>(device, context, queue, n_elems); + + if (gHasLong) + { + result |= run_test>>(device, context, queue, + n_elems); + result |= run_test>>(device, context, queue, + n_elems); + } + + return result; +}