diff --git a/test_conformance/subgroups/CMakeLists.txt b/test_conformance/subgroups/CMakeLists.txt index eb6a6079..d48af9cc 100644 --- a/test_conformance/subgroups/CMakeLists.txt +++ b/test_conformance/subgroups/CMakeLists.txt @@ -5,8 +5,16 @@ set(${MODULE_NAME}_SOURCES test_barrier.cpp test_queries.cpp test_workitem.cpp - test_workgroup.cpp + test_subgroup.cpp test_ifp.cpp + test_subgroup_extended_types.cpp + subgroup_common_kernels.cpp + test_subgroup_non_uniform_vote.cpp + test_subgroup_non_uniform_arithmetic.cpp + test_subgroup_ballot.cpp + test_subgroup_clustered_reduce.cpp + test_subgroup_shuffle.cpp + test_subgroup_shuffle_relative.cpp ) include(../CMakeCommon.txt) diff --git a/test_conformance/subgroups/main.cpp b/test_conformance/subgroups/main.cpp index f9a9a9d5..44416dd7 100644 --- a/test_conformance/subgroups/main.cpp +++ b/test_conformance/subgroups/main.cpp @@ -27,12 +27,19 @@ test_definition test_list[] = { ADD_TEST_VERSION(sub_group_info_core, Version(2, 1)), ADD_TEST_VERSION(work_item_functions_ext, Version(2, 0)), ADD_TEST_VERSION(work_item_functions_core, Version(2, 1)), - ADD_TEST_VERSION(work_group_functions_ext, Version(2, 0)), - ADD_TEST_VERSION(work_group_functions_core, Version(2, 1)), + ADD_TEST_VERSION(subgroup_functions_ext, Version(2, 0)), + ADD_TEST_VERSION(subgroup_functions_core, Version(2, 1)), ADD_TEST_VERSION(barrier_functions_ext, Version(2, 0)), ADD_TEST_VERSION(barrier_functions_core, Version(2, 1)), ADD_TEST_VERSION(ifp_ext, Version(2, 0)), - ADD_TEST_VERSION(ifp_core, Version(2, 1)) + ADD_TEST_VERSION(ifp_core, Version(2, 1)), + ADD_TEST(subgroup_functions_extended_types), + ADD_TEST(subgroup_functions_non_uniform_vote), + ADD_TEST(subgroup_functions_non_uniform_arithmetic), + ADD_TEST(subgroup_functions_ballot), + ADD_TEST(subgroup_functions_clustered_reduce), + ADD_TEST(subgroup_functions_shuffle), + ADD_TEST(subgroup_functions_shuffle_relative) }; const int test_num = ARRAY_SIZE(test_list); diff --git a/test_conformance/subgroups/procs.h b/test_conformance/subgroups/procs.h index 3ebb13b5..d09e8242 100644 --- a/test_conformance/subgroups/procs.h +++ b/test_conformance/subgroups/procs.h @@ -37,14 +37,12 @@ extern int test_work_item_functions_core(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements); -extern int test_work_group_functions_ext(cl_device_id device, - cl_context context, - cl_command_queue queue, - int num_elements); -extern int test_work_group_functions_core(cl_device_id device, - cl_context context, - cl_command_queue queue, - int num_elements); +extern int test_subgroup_functions_ext(cl_device_id device, cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_subgroup_functions_core(cl_device_id device, cl_context context, + cl_command_queue queue, + int num_elements); extern int test_barrier_functions_ext(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements); extern int test_barrier_functions_core(cl_device_id device, cl_context context, @@ -56,5 +54,31 @@ extern int test_ifp_ext(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements); extern int test_ifp_core(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements); - +extern int test_subgroup_functions_extended_types(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_subgroup_functions_non_uniform_vote(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_subgroup_functions_non_uniform_arithmetic( + cl_device_id device, cl_context context, cl_command_queue queue, + int num_elements); +extern int test_subgroup_functions_ballot(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_subgroup_functions_clustered_reduce(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_subgroup_functions_shuffle(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements); +extern int test_subgroup_functions_shuffle_relative(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements); #endif /*_procs_h*/ diff --git a/test_conformance/subgroups/subgroup_common_kernels.cpp b/test_conformance/subgroups/subgroup_common_kernels.cpp new file mode 100644 index 00000000..f8b24450 --- /dev/null +++ b/test_conformance/subgroups/subgroup_common_kernels.cpp @@ -0,0 +1,106 @@ +// +// Copyright (c) 2021 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#include "subgroup_common_kernels.h" + +const char* bcast_source = + "__kernel void test_bcast(const __global Type *in, " + "__global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " Type x = in[gid];\n" + " uint which_sub_group_local_id = xy[gid].z;\n" + " out[gid] = sub_group_broadcast(x, which_sub_group_local_id);\n" + + "}\n"; + +const char* redadd_source = "__kernel void test_redadd(const __global Type " + "*in, __global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " out[gid] = sub_group_reduce_add(in[gid]);\n" + "}\n"; + +const char* redmax_source = "__kernel void test_redmax(const __global Type " + "*in, __global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " out[gid] = sub_group_reduce_max(in[gid]);\n" + "}\n"; + +const char* redmin_source = "__kernel void test_redmin(const __global Type " + "*in, __global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " out[gid] = sub_group_reduce_min(in[gid]);\n" + "}\n"; + +const char* scinadd_source = + "__kernel void test_scinadd(const __global Type *in, __global int4 *xy, " + "__global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " out[gid] = sub_group_scan_inclusive_add(in[gid]);\n" + "}\n"; + +const char* scinmax_source = + "__kernel void test_scinmax(const __global Type *in, __global int4 *xy, " + "__global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " out[gid] = sub_group_scan_inclusive_max(in[gid]);\n" + "}\n"; + +const char* scinmin_source = + "__kernel void test_scinmin(const __global Type *in, __global int4 *xy, " + "__global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " out[gid] = sub_group_scan_inclusive_min(in[gid]);\n" + "}\n"; + +const char* scexadd_source = + "__kernel void test_scexadd(const __global Type *in, __global int4 *xy, " + "__global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " out[gid] = sub_group_scan_exclusive_add(in[gid]);\n" + "}\n"; + +const char* scexmax_source = + "__kernel void test_scexmax(const __global Type *in, __global int4 *xy, " + "__global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " out[gid] = sub_group_scan_exclusive_max(in[gid]);\n" + "}\n"; + +const char* scexmin_source = + "__kernel void test_scexmin(const __global Type *in, __global int4 *xy, " + "__global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " out[gid] = sub_group_scan_exclusive_min(in[gid]);\n" + "}\n"; diff --git a/test_conformance/subgroups/subgroup_common_kernels.h b/test_conformance/subgroups/subgroup_common_kernels.h new file mode 100644 index 00000000..8ae97d9a --- /dev/null +++ b/test_conformance/subgroups/subgroup_common_kernels.h @@ -0,0 +1,32 @@ +// +// Copyright (c) 2021 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#ifndef SUBGROUPKERNELSOURCES_H +#define SUBGROUPKERNELSOURCES_H +#include "subhelpers.h" + + +extern const char* bcast_source; +extern const char* redadd_source; +extern const char* redmax_source; +extern const char* redmin_source; +extern const char* scinadd_source; +extern const char* scinmax_source; +extern const char* scinmin_source; +extern const char* scexadd_source; +extern const char* scexmax_source; +extern const char* scexmin_source; + +#endif diff --git a/test_conformance/subgroups/subgroup_common_templates.h b/test_conformance/subgroups/subgroup_common_templates.h new file mode 100644 index 00000000..b30c416b --- /dev/null +++ b/test_conformance/subgroups/subgroup_common_templates.h @@ -0,0 +1,911 @@ +// +// Copyright (c) 2020 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#ifndef SUBGROUPCOMMONTEMPLATES_H +#define SUBGROUPCOMMONTEMPLATES_H + +#include "typeWrappers.h" +#include +#include "CL/cl_half.h" +#include "subhelpers.h" + +#include + +typedef std::bitset<128> bs128; +static cl_uint4 generate_bit_mask(cl_uint subgroup_local_id, + const std::string &mask_type, + cl_uint max_sub_group_size) +{ + bs128 mask128; + cl_uint4 mask; + cl_uint pos = subgroup_local_id; + if (mask_type == "eq") mask128.set(pos); + if (mask_type == "le" || mask_type == "lt") + { + for (cl_uint i = 0; i <= pos; i++) mask128.set(i); + if (mask_type == "lt") mask128.reset(pos); + } + if (mask_type == "ge" || mask_type == "gt") + { + for (cl_uint i = pos; i < max_sub_group_size; i++) mask128.set(i); + if (mask_type == "gt") mask128.reset(pos); + } + + // convert std::bitset<128> to uint4 + auto const uint_mask = bs128{ static_cast(-1) }; + mask.s0 = (mask128 & uint_mask).to_ulong(); + mask128 >>= 32; + mask.s1 = (mask128 & uint_mask).to_ulong(); + mask128 >>= 32; + mask.s2 = (mask128 & uint_mask).to_ulong(); + mask128 >>= 32; + mask.s3 = (mask128 & uint_mask).to_ulong(); + + return mask; +} + +// DESCRIPTION : +// sub_group_broadcast - each work_item registers it's own value. +// All work_items in subgroup takes one value from only one (any) work_item +// sub_group_broadcast_first - same as type 0. All work_items in +// subgroup takes only one value from only one chosen (the smallest subgroup ID) +// work_item +// sub_group_non_uniform_broadcast - same as type 0 but +// only 4 work_items from subgroup enter the code (are active) +template struct BC +{ + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) + { + int i, ii, j, k, n; + int ng = test_params.global_workgroup_size; + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int nj = (nw + ns - 1) / ns; + int d = ns > 100 ? 100 : ns; + int non_uniform_size = ng % nw; + ng = ng / nw; + int last_subgroup_size = 0; + ii = 0; + + log_info(" sub_group_%s(%s)...\n", operation_names(operation), + TypeManager::name()); + if (non_uniform_size) + { + log_info(" non uniform work group size mode ON\n"); + ng++; + } + for (k = 0; k < ng; ++k) + { // for each work_group + if (non_uniform_size && k == ng - 1) + { + set_last_workgroup_params(non_uniform_size, nj, ns, nw, + last_subgroup_size); + } + for (j = 0; j < nj; ++j) + { // for each subgroup + ii = j * ns; + if (last_subgroup_size && j == nj - 1) + { + n = last_subgroup_size; + } + else + { + n = ii + ns > nw ? nw - ii : ns; + } + int bcast_if = 0; + int bcast_elseif = 0; + int bcast_index = (int)(genrand_int32(gMTdata) & 0x7fffffff) + % (d > n ? n : d); + // l - calculate subgroup local id from which value will be + // broadcasted (one the same value for whole subgroup) + if (operation != SubgroupsBroadcastOp::broadcast) + { + // reduce brodcasting index in case of non_uniform and + // last workgroup last subgroup + if (last_subgroup_size && j == nj - 1 + && last_subgroup_size < NR_OF_ACTIVE_WORK_ITEMS) + { + bcast_if = bcast_index % last_subgroup_size; + bcast_elseif = bcast_if; + } + else + { + bcast_if = bcast_index % NR_OF_ACTIVE_WORK_ITEMS; + bcast_elseif = NR_OF_ACTIVE_WORK_ITEMS + + bcast_index % (n - NR_OF_ACTIVE_WORK_ITEMS); + } + } + + for (i = 0; i < n; ++i) + { + if (operation == SubgroupsBroadcastOp::broadcast) + { + int midx = 4 * ii + 4 * i + 2; + m[midx] = (cl_int)bcast_index; + } + else + { + if (i < NR_OF_ACTIVE_WORK_ITEMS) + { + // index of the third + // element int the vector. + int midx = 4 * ii + 4 * i + 2; + // storing information about + // broadcasting index - + // earlier calculated + m[midx] = (cl_int)bcast_if; + } + else + { // index of the third + // element int the vector. + int midx = 4 * ii + 4 * i + 3; + m[midx] = (cl_int)bcast_elseif; + } + } + + // calculate value for broadcasting + cl_ulong number = genrand_int64(gMTdata); + set_value(t[ii + i], number); + } + } + // Now map into work group using map from device + for (j = 0; j < nw; ++j) + { // for each element in work_group + // calculate index as number of subgroup + // plus subgroup local id + x[j] = t[j]; + } + x += nw; + m += 4 * nw; + } + } + + static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) + { + int ii, i, j, k, l, n; + int ng = test_params.global_workgroup_size; + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int nj = (nw + ns - 1) / ns; + Ty tr, rr; + int non_uniform_size = ng % nw; + ng = ng / nw; + int last_subgroup_size = 0; + if (non_uniform_size) ng++; + + for (k = 0; k < ng; ++k) + { // for each work_group + if (non_uniform_size && k == ng - 1) + { + set_last_workgroup_params(non_uniform_size, nj, ns, nw, + last_subgroup_size); + } + for (j = 0; j < nw; ++j) + { // inside the work_group + mx[j] = x[j]; // read host inputs for work_group + my[j] = y[j]; // read device outputs for work_group + } + + for (j = 0; j < nj; ++j) + { // for each subgroup + ii = j * ns; + if (last_subgroup_size && j == nj - 1) + { + n = last_subgroup_size; + } + else + { + n = ii + ns > nw ? nw - ii : ns; + } + + // Check result + if (operation == SubgroupsBroadcastOp::broadcast_first) + { + int lowest_active_id = -1; + for (i = 0; i < n; ++i) + { + + lowest_active_id = i < NR_OF_ACTIVE_WORK_ITEMS + ? 0 + : NR_OF_ACTIVE_WORK_ITEMS; + // findout if broadcasted + // value is the same + tr = mx[ii + lowest_active_id]; + // findout if broadcasted to all + rr = my[ii + i]; + + if (!compare(rr, tr)) + { + log_error( + "ERROR: sub_group_broadcast_first(%s) " + "mismatch " + "for local id %d in sub group %d in group " + "%d\n", + TypeManager::name(), i, j, k); + return TEST_FAIL; + } + } + } + else + { + for (i = 0; i < n; ++i) + { + if (operation == SubgroupsBroadcastOp::broadcast) + { + int midx = 4 * ii + 4 * i + 2; + l = (int)m[midx]; + tr = mx[ii + l]; + } + else + { + if (i < NR_OF_ACTIVE_WORK_ITEMS) + { // take index of array where info + // which work_item will be + // broadcast its value is stored + int midx = 4 * ii + 4 * i + 2; + // take subgroup local id of + // this work_item + l = (int)m[midx]; + // take value generated on host + // for this work_item + tr = mx[ii + l]; + } + else + { + int midx = 4 * ii + 4 * i + 3; + l = (int)m[midx]; + tr = mx[ii + l]; + } + } + rr = my[ii + i]; // read device outputs for + // work_item in the subgroup + + if (!compare(rr, tr)) + { + log_error("ERROR: sub_group_%s(%s) " + "mismatch for local id %d in sub " + "group %d in group %d - got %lu " + "expected %lu\n", + operation_names(operation), + TypeManager::name(), i, j, k, rr, tr); + return TEST_FAIL; + } + } + } + } + x += nw; + y += nw; + m += 4 * nw; + } + log_info(" sub_group_%s(%s)... passed\n", operation_names(operation), + TypeManager::name()); + return TEST_PASS; + } +}; + +static float to_float(subgroups::cl_half x) { return cl_half_to_float(x.data); } + +static subgroups::cl_half to_half(float x) +{ + subgroups::cl_half value; + value.data = cl_half_from_float(x, CL_HALF_RTE); + return value; +} + +// for integer types +template inline Ty calculate(Ty a, Ty b, ArithmeticOp operation) +{ + switch (operation) + { + case ArithmeticOp::add_: return a + b; + case ArithmeticOp::max_: return a > b ? a : b; + case ArithmeticOp::min_: return a < b ? a : b; + case ArithmeticOp::mul_: return a * b; + case ArithmeticOp::and_: return a & b; + case ArithmeticOp::or_: return a | b; + case ArithmeticOp::xor_: return a ^ b; + case ArithmeticOp::logical_and: return a && b; + case ArithmeticOp::logical_or: return a || b; + case ArithmeticOp::logical_xor: return !a ^ !b; + default: log_error("Unknown operation request"); break; + } + return 0; +} +// Specialize for floating points. +template <> +inline cl_double calculate(cl_double a, cl_double b, ArithmeticOp operation) +{ + switch (operation) + { + case ArithmeticOp::add_: { + return a + b; + } + case ArithmeticOp::max_: { + return a > b ? a : b; + } + case ArithmeticOp::min_: { + return a < b ? a : b; + } + case ArithmeticOp::mul_: { + return a * b; + } + default: log_error("Unknown operation request"); break; + } + return 0; +} + +template <> +inline cl_float calculate(cl_float a, cl_float b, ArithmeticOp operation) +{ + switch (operation) + { + case ArithmeticOp::add_: { + return a + b; + } + case ArithmeticOp::max_: { + return a > b ? a : b; + } + case ArithmeticOp::min_: { + return a < b ? a : b; + } + case ArithmeticOp::mul_: { + return a * b; + } + default: log_error("Unknown operation request"); break; + } + return 0; +} + +template <> +inline subgroups::cl_half calculate(subgroups::cl_half a, subgroups::cl_half b, + ArithmeticOp operation) +{ + switch (operation) + { + case ArithmeticOp::add_: return to_half(to_float(a) + to_float(b)); + case ArithmeticOp::max_: + return to_float(a) > to_float(b) || is_half_nan(b.data) ? a : b; + case ArithmeticOp::min_: + return to_float(a) < to_float(b) || is_half_nan(b.data) ? a : b; + case ArithmeticOp::mul_: return to_half(to_float(a) * to_float(b)); + default: log_error("Unknown operation request"); break; + } + return to_half(0); +} + +template bool is_floating_point() +{ + return std::is_floating_point::value + || std::is_same::value; +} + +template +void genrand(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng) +{ + int nj = (nw + ns - 1) / ns; + + for (int k = 0; k < ng; ++k) + { + for (int j = 0; j < nj; ++j) + { + int ii = j * ns; + int n = ii + ns > nw ? nw - ii : ns; + + for (int i = 0; i < n; ++i) + { + cl_ulong out_value; + double y; + if (operation == ArithmeticOp::mul_ + || operation == ArithmeticOp::add_) + { + // work around to avoid overflow, do not use 0 for + // multiplication + out_value = (genrand_int32(gMTdata) % 4) + 1; + } + else + { + out_value = genrand_int64(gMTdata) % (32 * n); + if ((operation == ArithmeticOp::logical_and + || operation == ArithmeticOp::logical_or + || operation == ArithmeticOp::logical_xor) + && ((out_value >> 32) & 1) == 0) + out_value = 0; // increase probability of false + } + set_value(t[ii + i], out_value); + } + } + + // Now map into work group using map from device + for (int j = 0; j < nw; ++j) + { + x[j] = t[j]; + } + + x += nw; + m += 4 * nw; + } +} + +template struct SHF +{ + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) + { + int i, ii, j, k, l, n, delta; + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int ng = test_params.global_workgroup_size; + int nj = (nw + ns - 1) / ns; + int d = ns > 100 ? 100 : ns; + ii = 0; + ng = ng / nw; + log_info(" sub_group_%s(%s)...\n", operation_names(operation), + TypeManager::name()); + for (k = 0; k < ng; ++k) + { // for each work_group + for (j = 0; j < nj; ++j) + { // for each subgroup + ii = j * ns; + n = ii + ns > nw ? nw - ii : ns; + for (i = 0; i < n; ++i) + { + int midx = 4 * ii + 4 * i + 2; + l = (int)(genrand_int32(gMTdata) & 0x7fffffff) + % (d > n ? n : d); + switch (operation) + { + case ShuffleOp::shuffle: + case ShuffleOp::shuffle_xor: + // storing information about shuffle index + m[midx] = (cl_int)l; + break; + case ShuffleOp::shuffle_up: + delta = l; // calculate delta for shuffle up + if (i - delta < 0) + { + delta = i; + } + m[midx] = (cl_int)delta; + break; + case ShuffleOp::shuffle_down: + delta = l; // calculate delta for shuffle down + if (i + delta >= n) + { + delta = n - 1 - i; + } + m[midx] = (cl_int)delta; + break; + default: break; + } + cl_ulong number = genrand_int64(gMTdata); + set_value(t[ii + i], number); + } + } + // Now map into work group using map from device + for (j = 0; j < nw; ++j) + { // for each element in work_group + x[j] = t[j]; + } + x += nw; + m += 4 * nw; + } + } + + static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) + { + int ii, i, j, k, l, n; + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int ng = test_params.global_workgroup_size; + int nj = (nw + ns - 1) / ns; + Ty tr, rr; + ng = ng / nw; + + for (k = 0; k < ng; ++k) + { // for each work_group + for (j = 0; j < nw; ++j) + { // inside the work_group + mx[j] = x[j]; // read host inputs for work_group + my[j] = y[j]; // read device outputs for work_group + } + + for (j = 0; j < nj; ++j) + { // for each subgroup + ii = j * ns; + n = ii + ns > nw ? nw - ii : ns; + + for (i = 0; i < n; ++i) + { // inside the subgroup + // shuffle index storage + int midx = 4 * ii + 4 * i + 2; + l = (int)m[midx]; + rr = my[ii + i]; + switch (operation) + { + // shuffle basic - treat l as index + case ShuffleOp::shuffle: tr = mx[ii + l]; break; + // shuffle up - treat l as delta + case ShuffleOp::shuffle_up: tr = mx[ii + i - l]; break; + // shuffle up - treat l as delta + case ShuffleOp::shuffle_down: + tr = mx[ii + i + l]; + break; + // shuffle xor - treat l as mask + case ShuffleOp::shuffle_xor: + tr = mx[ii + (i ^ l)]; + break; + default: break; + } + + if (!compare(rr, tr)) + { + log_error("ERROR: sub_group_%s(%s) mismatch for " + "local id %d in sub group %d in group %d\n", + operation_names(operation), + TypeManager::name(), i, j, k); + return TEST_FAIL; + } + } + } + x += nw; + y += nw; + m += 4 * nw; + } + log_info(" sub_group_%s(%s)... passed\n", operation_names(operation), + TypeManager::name()); + return TEST_PASS; + } +}; + +template struct SCEX_NU +{ + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) + { + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int ng = test_params.global_workgroup_size; + uint32_t work_items_mask = test_params.work_items_mask; + ng = ng / nw; + std::string func_name; + work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive" + : func_name = "sub_group_scan_exclusive"; + log_info(" %s_%s(%s)...\n", func_name.c_str(), + operation_names(operation), TypeManager::name()); + log_info(" test params: global size = %d local size = %d subgroups " + "size = %d work item mask = 0x%x \n", + test_params.global_workgroup_size, nw, ns, work_items_mask); + genrand(x, t, m, ns, nw, ng); + } + + static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) + { + int ii, i, j, k, n; + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int ng = test_params.global_workgroup_size; + uint32_t work_items_mask = test_params.work_items_mask; + int nj = (nw + ns - 1) / ns; + Ty tr, rr; + ng = ng / nw; + + std::string func_name; + work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive" + : func_name = "sub_group_scan_exclusive"; + + uint32_t use_work_items_mask; + // for uniform case take into consideration all workitems + use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask; + for (k = 0; k < ng; ++k) + { // for each work_group + // Map to array indexed to array indexed by local ID and sub group + for (j = 0; j < nw; ++j) + { // inside the work_group + mx[j] = x[j]; // read host inputs for work_group + my[j] = y[j]; // read device outputs for work_group + } + for (j = 0; j < nj; ++j) + { + ii = j * ns; + n = ii + ns > nw ? nw - ii : ns; + std::set active_work_items; + for (i = 0; i < n; ++i) + { + uint32_t check_work_item = 1 << (i % 32); + if (use_work_items_mask & check_work_item) + { + active_work_items.insert(i); + } + } + if (active_work_items.empty()) + { + log_info(" No acitve workitems in workgroup id = %d " + "subgroup id = %d - no calculation\n", + k, j); + continue; + } + else if (active_work_items.size() == 1) + { + log_info(" One active workitem in workgroup id = %d " + "subgroup id = %d - no calculation\n", + k, j); + continue; + } + else + { + tr = TypeManager::identify_limits(operation); + int idx = 0; + for (const int &active_work_item : active_work_items) + { + rr = my[ii + active_work_item]; + if (idx == 0) continue; + + if (!compare_ordered(rr, tr)) + { + log_error( + "ERROR: %s_%s(%s) " + "mismatch for local id %d in sub group %d in " + "group %d Expected: %d Obtained: %d\n", + func_name.c_str(), operation_names(operation), + TypeManager::name(), i, j, k, tr, rr); + return TEST_FAIL; + } + tr = calculate(tr, mx[ii + active_work_item], + operation); + idx++; + } + } + } + x += nw; + y += nw; + m += 4 * nw; + } + + log_info(" %s_%s(%s)... passed\n", func_name.c_str(), + operation_names(operation), TypeManager::name()); + return TEST_PASS; + } +}; + +// Test for scan inclusive non uniform functions +template struct SCIN_NU +{ + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) + { + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int ng = test_params.global_workgroup_size; + uint32_t work_items_mask = test_params.work_items_mask; + ng = ng / nw; + std::string func_name; + work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive" + : func_name = "sub_group_scan_inclusive"; + + genrand(x, t, m, ns, nw, ng); + log_info(" %s_%s(%s)...\n", func_name.c_str(), + operation_names(operation), TypeManager::name()); + log_info(" test params: global size = %d local size = %d subgroups " + "size = %d work item mask = 0x%x \n", + test_params.global_workgroup_size, nw, ns, work_items_mask); + } + + static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) + { + int ii, i, j, k, n; + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int ng = test_params.global_workgroup_size; + uint32_t work_items_mask = test_params.work_items_mask; + int nj = (nw + ns - 1) / ns; + Ty tr, rr; + ng = ng / nw; + + std::string func_name; + work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive" + : func_name = "sub_group_scan_inclusive"; + + uint32_t use_work_items_mask; + // for uniform case take into consideration all workitems + use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask; + // std::bitset<32> mask32(use_work_items_mask); + // for (int k) mask32.count(); + for (k = 0; k < ng; ++k) + { // for each work_group + // Map to array indexed to array indexed by local ID and sub group + for (j = 0; j < nw; ++j) + { // inside the work_group + mx[j] = x[j]; // read host inputs for work_group + my[j] = y[j]; // read device outputs for work_group + } + for (j = 0; j < nj; ++j) + { + ii = j * ns; + n = ii + ns > nw ? nw - ii : ns; + std::set active_work_items; + int catch_frist_active = -1; + + for (i = 0; i < n; ++i) + { + uint32_t check_work_item = 1 << (i % 32); + if (use_work_items_mask & check_work_item) + { + if (catch_frist_active == -1) + { + catch_frist_active = i; + } + active_work_items.insert(i); + } + } + if (active_work_items.empty()) + { + log_info(" No acitve workitems in workgroup id = %d " + "subgroup id = %d - no calculation\n", + k, j); + continue; + } + else + { + tr = TypeManager::identify_limits(operation); + for (const int &active_work_item : active_work_items) + { + rr = my[ii + active_work_item]; + if (active_work_items.size() == 1) + { + tr = mx[ii + catch_frist_active]; + } + else + { + tr = calculate(tr, mx[ii + active_work_item], + operation); + } + if (!compare_ordered(rr, tr)) + { + log_error( + "ERROR: %s_%s(%s) " + "mismatch for local id %d in sub group %d " + "in " + "group %d Expected: %d Obtained: %d\n", + func_name.c_str(), operation_names(operation), + TypeManager::name(), active_work_item, j, k, + tr, rr); + return TEST_FAIL; + } + } + } + } + x += nw; + y += nw; + m += 4 * nw; + } + + log_info(" %s_%s(%s)... passed\n", func_name.c_str(), + operation_names(operation), TypeManager::name()); + return TEST_PASS; + } +}; + +// Test for reduce non uniform functions +template struct RED_NU +{ + + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) + { + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int ng = test_params.global_workgroup_size; + uint32_t work_items_mask = test_params.work_items_mask; + ng = ng / nw; + std::string func_name; + + work_items_mask ? func_name = "sub_group_non_uniform_reduce" + : func_name = "sub_group_reduce"; + log_info(" %s_%s(%s)...\n", func_name.c_str(), + operation_names(operation), TypeManager::name()); + log_info(" test params: global size = %d local size = %d subgroups " + "size = %d work item mask = 0x%x \n", + test_params.global_workgroup_size, nw, ns, work_items_mask); + genrand(x, t, m, ns, nw, ng); + } + + static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) + { + int ii, i, j, k, n; + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int ng = test_params.global_workgroup_size; + uint32_t work_items_mask = test_params.work_items_mask; + int nj = (nw + ns - 1) / ns; + ng = ng / nw; + Ty tr, rr; + + std::string func_name; + work_items_mask ? func_name = "sub_group_non_uniform_reduce" + : func_name = "sub_group_reduce"; + + for (k = 0; k < ng; ++k) + { + // Map to array indexed to array indexed by local ID and sub + // group + for (j = 0; j < nw; ++j) + { + mx[j] = x[j]; + my[j] = y[j]; + } + + uint32_t use_work_items_mask; + use_work_items_mask = + !work_items_mask ? 0xFFFFFFFF : work_items_mask; + + for (j = 0; j < nj; ++j) + { + ii = j * ns; + n = ii + ns > nw ? nw - ii : ns; + std::set active_work_items; + int catch_frist_active = -1; + for (i = 0; i < n; ++i) + { + uint32_t check_work_item = 1 << (i % 32); + if (use_work_items_mask & check_work_item) + { + if (catch_frist_active == -1) + { + catch_frist_active = i; + tr = mx[ii + i]; + active_work_items.insert(i); + continue; + } + active_work_items.insert(i); + tr = calculate(tr, mx[ii + i], operation); + } + } + + if (active_work_items.empty()) + { + log_info(" No acitve workitems in workgroup id = %d " + "subgroup id = %d - no calculation\n", + k, j); + continue; + } + + for (const int &active_work_item : active_work_items) + { + rr = my[ii + active_work_item]; + if (!compare_ordered(rr, tr)) + { + log_error("ERROR: %s_%s(%s) " + "mismatch for local id %d in sub group %d in " + "group %d Expected: %d Obtained: %d\n", + func_name.c_str(), operation_names(operation), + TypeManager::name(), active_work_item, j, + k, tr, rr); + return TEST_FAIL; + } + } + } + x += nw; + y += nw; + m += 4 * nw; + } + + log_info(" %s_%s(%s)... passed\n", func_name.c_str(), + operation_names(operation), TypeManager::name()); + return TEST_PASS; + } +}; + +#endif diff --git a/test_conformance/subgroups/subhelpers.h b/test_conformance/subgroups/subhelpers.h index dc49af2d..93673b35 100644 --- a/test_conformance/subgroups/subhelpers.h +++ b/test_conformance/subgroups/subhelpers.h @@ -19,13 +19,176 @@ #include "testHarness.h" #include "kernelHelpers.h" #include "typeWrappers.h" +#include "imageHelpers.h" #include #include +#include + +#define NR_OF_ACTIVE_WORK_ITEMS 4 + +extern MTdata gMTdata; + +struct WorkGroupParams +{ + WorkGroupParams(size_t gws, size_t lws, + const std::vector &req_ext = {}, + const std::vector &all_wim = {}) + : global_workgroup_size(gws), local_workgroup_size(lws), + required_extensions(req_ext), all_work_item_masks(all_wim) + { + subgroup_size = 0; + work_items_mask = 0; + use_core_subgroups = true; + dynsc = 0; + } + size_t global_workgroup_size; + size_t local_workgroup_size; + size_t subgroup_size; + uint32_t work_items_mask; + int dynsc; + bool use_core_subgroups; + std::vector required_extensions; + std::vector all_work_item_masks; +}; + +enum class SubgroupsBroadcastOp +{ + broadcast, + broadcast_first, + non_uniform_broadcast +}; + +enum class NonUniformVoteOp +{ + elect, + all, + any, + all_equal +}; + +enum class BallotOp +{ + ballot, + inverse_ballot, + ballot_bit_extract, + ballot_bit_count, + ballot_inclusive_scan, + ballot_exclusive_scan, + ballot_find_lsb, + ballot_find_msb, + eq_mask, + ge_mask, + gt_mask, + le_mask, + lt_mask, +}; + +enum class ShuffleOp +{ + shuffle, + shuffle_up, + shuffle_down, + shuffle_xor +}; + +enum class ArithmeticOp +{ + add_, + max_, + min_, + mul_, + and_, + or_, + xor_, + logical_and, + logical_or, + logical_xor +}; + +static const char *const operation_names(ArithmeticOp operation) +{ + switch (operation) + { + case ArithmeticOp::add_: return "add"; + case ArithmeticOp::max_: return "max"; + case ArithmeticOp::min_: return "min"; + case ArithmeticOp::mul_: return "mul"; + case ArithmeticOp::and_: return "and"; + case ArithmeticOp::or_: return "or"; + case ArithmeticOp::xor_: return "xor"; + case ArithmeticOp::logical_and: return "logical_and"; + case ArithmeticOp::logical_or: return "logical_or"; + case ArithmeticOp::logical_xor: return "logical_xor"; + default: log_error("Unknown operation request"); break; + } + return ""; +} + +static const char *const operation_names(BallotOp operation) +{ + switch (operation) + { + case BallotOp::ballot: return "ballot"; + case BallotOp::inverse_ballot: return "inverse_ballot"; + case BallotOp::ballot_bit_extract: return "bit_extract"; + case BallotOp::ballot_bit_count: return "bit_count"; + case BallotOp::ballot_inclusive_scan: return "inclusive_scan"; + case BallotOp::ballot_exclusive_scan: return "exclusive_scan"; + case BallotOp::ballot_find_lsb: return "find_lsb"; + case BallotOp::ballot_find_msb: return "find_msb"; + case BallotOp::eq_mask: return "eq"; + case BallotOp::ge_mask: return "ge"; + case BallotOp::gt_mask: return "gt"; + case BallotOp::le_mask: return "le"; + case BallotOp::lt_mask: return "lt"; + default: log_error("Unknown operation request"); break; + } + return ""; +} + +static const char *const operation_names(ShuffleOp operation) +{ + switch (operation) + { + case ShuffleOp::shuffle: return "shuffle"; + case ShuffleOp::shuffle_up: return "shuffle_up"; + case ShuffleOp::shuffle_down: return "shuffle_down"; + case ShuffleOp::shuffle_xor: return "shuffle_xor"; + default: log_error("Unknown operation request"); break; + } + return ""; +} + +static const char *const operation_names(NonUniformVoteOp operation) +{ + switch (operation) + { + case NonUniformVoteOp::all: return "all"; + case NonUniformVoteOp::all_equal: return "all_equal"; + case NonUniformVoteOp::any: return "any"; + case NonUniformVoteOp::elect: return "elect"; + default: log_error("Unknown operation request"); break; + } + return ""; +} + +static const char *const operation_names(SubgroupsBroadcastOp operation) +{ + switch (operation) + { + case SubgroupsBroadcastOp::broadcast: return "broadcast"; + case SubgroupsBroadcastOp::broadcast_first: return "broadcast_first"; + case SubgroupsBroadcastOp::non_uniform_broadcast: + return "non_uniform_broadcast"; + default: log_error("Unknown operation request"); break; + } + return ""; +} class subgroupsAPI { public: - subgroupsAPI(cl_platform_id platform, bool useCoreSubgroups) + subgroupsAPI(cl_platform_id platform, bool use_core_subgroups) { static_assert(CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE == CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR, @@ -33,7 +196,7 @@ public: static_assert(CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE == CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR, "Enums have to be the same"); - if (useCoreSubgroups) + if (use_core_subgroups) { _clGetKernelSubGroupInfo_ptr = &clGetKernelSubGroupInfo; clGetKernelSubGroupInfo_name = "clGetKernelSubGroupInfo"; @@ -56,163 +219,76 @@ private: clGetKernelSubGroupInfoKHR_fn _clGetKernelSubGroupInfo_ptr; }; -// Some template helpers -template struct TypeName; -template <> struct TypeName +// Need to defined custom type for vector size = 3 and half type. This is +// because of 3-component types are otherwise indistinguishable from the +// 4-component types, and because the half type is indistinguishable from some +// other 16-bit type (ushort) +namespace subgroups { +struct cl_char3 { - static const char *val() { return "half"; } + ::cl_char3 data; }; -template <> struct TypeName +struct cl_uchar3 { - static const char *val() { return "uint"; } + ::cl_uchar3 data; }; -template <> struct TypeName +struct cl_short3 { - static const char *val() { return "int"; } + ::cl_short3 data; }; -template <> struct TypeName +struct cl_ushort3 { - static const char *val() { return "ulong"; } + ::cl_ushort3 data; }; -template <> struct TypeName +struct cl_int3 { - static const char *val() { return "long"; } + ::cl_int3 data; }; -template <> struct TypeName +struct cl_uint3 { - static const char *val() { return "float"; } + ::cl_uint3 data; }; -template <> struct TypeName +struct cl_long3 { - static const char *val() { return "double"; } + ::cl_long3 data; }; - -template struct TypeDef; -template <> struct TypeDef +struct cl_ulong3 { - static const char *val() { return "typedef half Type;\n"; } + ::cl_ulong3 data; }; -template <> struct TypeDef +struct cl_float3 { - static const char *val() { return "typedef uint Type;\n"; } + ::cl_float3 data; }; -template <> struct TypeDef +struct cl_double3 { - static const char *val() { return "typedef int Type;\n"; } + ::cl_double3 data; }; -template <> struct TypeDef +struct cl_half { - static const char *val() { return "typedef ulong Type;\n"; } + ::cl_half data; }; -template <> struct TypeDef +struct cl_half2 { - static const char *val() { return "typedef long Type;\n"; } + ::cl_half2 data; }; -template <> struct TypeDef +struct cl_half3 { - static const char *val() { return "typedef float Type;\n"; } + ::cl_half3 data; }; -template <> struct TypeDef +struct cl_half4 { - static const char *val() { return "typedef double Type;\n"; } + ::cl_half4 data; }; - -template struct TypeIdentity; -// template <> struct TypeIdentity { static cl_half val() { return -// (cl_half)0.0; } }; template <> struct TypeIdentity { static -// cl_half val() { return -(cl_half)65536.0; } }; template <> struct -// TypeIdentity { static cl_half val() { return (cl_half)65536.0; } -// }; - -template <> struct TypeIdentity +struct cl_half8 { - static cl_uint val() { return (cl_uint)0; } + ::cl_half8 data; }; -template <> struct TypeIdentity +struct cl_half16 { - static cl_uint val() { return (cl_uint)0; } -}; -template <> struct TypeIdentity -{ - static cl_uint val() { return (cl_uint)0xffffffff; } -}; - -template <> struct TypeIdentity -{ - static cl_int val() { return (cl_int)0; } -}; -template <> struct TypeIdentity -{ - static cl_int val() { return (cl_int)0x80000000; } -}; -template <> struct TypeIdentity -{ - static cl_int val() { return (cl_int)0x7fffffff; } -}; - -template <> struct TypeIdentity -{ - static cl_ulong val() { return (cl_ulong)0; } -}; -template <> struct TypeIdentity -{ - static cl_ulong val() { return (cl_ulong)0; } -}; -template <> struct TypeIdentity -{ - static cl_ulong val() { return (cl_ulong)0xffffffffffffffffULL; } -}; - -template <> struct TypeIdentity -{ - static cl_long val() { return (cl_long)0; } -}; -template <> struct TypeIdentity -{ - static cl_long val() { return (cl_long)0x8000000000000000ULL; } -}; -template <> struct TypeIdentity -{ - static cl_long val() { return (cl_long)0x7fffffffffffffffULL; } -}; - - -template <> struct TypeIdentity -{ - static float val() { return 0.F; } -}; -template <> struct TypeIdentity -{ - static float val() { return -std::numeric_limits::infinity(); } -}; -template <> struct TypeIdentity -{ - static float val() { return std::numeric_limits::infinity(); } -}; - -template <> struct TypeIdentity -{ - static double val() { return 0.L; } -}; - -template <> struct TypeIdentity -{ - static double val() { return -std::numeric_limits::infinity(); } -}; -template <> struct TypeIdentity -{ - static double val() { return std::numeric_limits::infinity(); } -}; - -template struct TypeCheck; -template <> struct TypeCheck -{ - static bool val(cl_device_id) { return true; } -}; -template <> struct TypeCheck -{ - static bool val(cl_device_id) { return true; } + ::cl_half16 data; }; +} static bool int64_ok(cl_device_id device) { @@ -233,43 +309,860 @@ static bool int64_ok(cl_device_id device) return true; } -template <> struct TypeCheck +static bool double_ok(cl_device_id device) { - static bool val(cl_device_id device) { return int64_ok(device); } -}; -template <> struct TypeCheck -{ - static bool val(cl_device_id device) { return int64_ok(device); } -}; -template <> struct TypeCheck -{ - static bool val(cl_device_id) { return true; } -}; -template <> struct TypeCheck -{ - static bool val(cl_device_id device) + int error; + cl_device_fp_config c; + error = clGetDeviceInfo(device, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(c), + (void *)&c, NULL); + if (error) { - return is_extension_available(device, "cl_khr_fp16"); + log_info("clGetDeviceInfo failed with CL_DEVICE_DOUBLE_FP_CONFIG\n"); + return false; } -}; -template <> struct TypeCheck + return c != 0; +} + +static bool half_ok(cl_device_id device) { - static bool val(cl_device_id device) + int error; + cl_device_fp_config c; + error = clGetDeviceInfo(device, CL_DEVICE_HALF_FP_CONFIG, sizeof(c), + (void *)&c, NULL); + if (error) { - int error; - cl_device_fp_config c; - error = clGetDeviceInfo(device, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(c), - (void *)&c, NULL); - if (error) + log_info("clGetDeviceInfo failed with CL_DEVICE_HALF_FP_CONFIG\n"); + return false; + } + return c != 0; +} + +template struct CommonTypeManager +{ + + static const char *name() { return ""; } + static const char *add_typedef() { return "\n"; } + typedef std::false_type is_vector_type; + typedef std::false_type is_sb_vector_size3; + typedef std::false_type is_sb_vector_type; + typedef std::false_type is_sb_scalar_type; + static const bool type_supported(cl_device_id) { return true; } + static const Ty identify_limits(ArithmeticOp operation) + { + switch (operation) { - log_info( - "clGetDeviceInfo failed with CL_DEVICE_DOUBLE_FP_CONFIG\n"); - return false; + case ArithmeticOp::add_: return (Ty)0; + case ArithmeticOp::max_: return (std::numeric_limits::min)(); + case ArithmeticOp::min_: return (std::numeric_limits::max)(); + case ArithmeticOp::mul_: return (Ty)1; + case ArithmeticOp::and_: return (Ty)~0; + case ArithmeticOp::or_: return (Ty)0; + case ArithmeticOp::xor_: return (Ty)0; + default: log_error("Unknown operation request"); break; } - return c != 0; + return 0; } }; +template struct TypeManager; + +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "int"; } + static const char *add_typedef() { return "typedef int Type;\n"; } + static cl_int identify_limits(ArithmeticOp operation) + { + switch (operation) + { + case ArithmeticOp::add_: return (cl_int)0; + case ArithmeticOp::max_: + return (std::numeric_limits::min)(); + case ArithmeticOp::min_: + return (std::numeric_limits::max)(); + case ArithmeticOp::mul_: return (cl_int)1; + case ArithmeticOp::and_: return (cl_int)~0; + case ArithmeticOp::or_: return (cl_int)0; + case ArithmeticOp::xor_: return (cl_int)0; + case ArithmeticOp::logical_and: return (cl_int)1; + case ArithmeticOp::logical_or: return (cl_int)0; + case ArithmeticOp::logical_xor: return (cl_int)0; + default: log_error("Unknown operation request"); break; + } + return 0; + } +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "int2"; } + static const char *add_typedef() { return "typedef int2 Type;\n"; } + typedef std::true_type is_vector_type; + using scalar_type = cl_int; +}; +template <> +struct TypeManager + : public CommonTypeManager +{ + static const char *name() { return "int3"; } + static const char *add_typedef() { return "typedef int3 Type;\n"; } + typedef std::true_type is_sb_vector_size3; + using scalar_type = cl_int; +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "int4"; } + static const char *add_typedef() { return "typedef int4 Type;\n"; } + using scalar_type = cl_int; + typedef std::true_type is_vector_type; +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "int8"; } + static const char *add_typedef() { return "typedef int8 Type;\n"; } + using scalar_type = cl_int; + typedef std::true_type is_vector_type; +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "int16"; } + static const char *add_typedef() { return "typedef int16 Type;\n"; } + using scalar_type = cl_int; + typedef std::true_type is_vector_type; +}; +// cl_uint +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "uint"; } + static const char *add_typedef() { return "typedef uint Type;\n"; } +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "uint2"; } + static const char *add_typedef() { return "typedef uint2 Type;\n"; } + using scalar_type = cl_uint; + typedef std::true_type is_vector_type; +}; +template <> +struct TypeManager + : public CommonTypeManager +{ + static const char *name() { return "uint3"; } + static const char *add_typedef() { return "typedef uint3 Type;\n"; } + typedef std::true_type is_sb_vector_size3; + using scalar_type = cl_uint; +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "uint4"; } + static const char *add_typedef() { return "typedef uint4 Type;\n"; } + using scalar_type = cl_uint; + typedef std::true_type is_vector_type; +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "uint8"; } + static const char *add_typedef() { return "typedef uint8 Type;\n"; } + using scalar_type = cl_uint; + typedef std::true_type is_vector_type; +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "uint16"; } + static const char *add_typedef() { return "typedef uint16 Type;\n"; } + using scalar_type = cl_uint; + typedef std::true_type is_vector_type; +}; +// cl_short +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "short"; } + static const char *add_typedef() { return "typedef short Type;\n"; } +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "short2"; } + static const char *add_typedef() { return "typedef short2 Type;\n"; } + using scalar_type = cl_short; + typedef std::true_type is_vector_type; +}; +template <> +struct TypeManager + : public CommonTypeManager +{ + static const char *name() { return "short3"; } + static const char *add_typedef() { return "typedef short3 Type;\n"; } + typedef std::true_type is_sb_vector_size3; + using scalar_type = cl_short; +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "short4"; } + static const char *add_typedef() { return "typedef short4 Type;\n"; } + using scalar_type = cl_short; + typedef std::true_type is_vector_type; +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "short8"; } + static const char *add_typedef() { return "typedef short8 Type;\n"; } + using scalar_type = cl_short; + typedef std::true_type is_vector_type; +}; +template <> +struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "short16"; } + static const char *add_typedef() { return "typedef short16 Type;\n"; } + using scalar_type = cl_short; + typedef std::true_type is_vector_type; +}; +// cl_ushort +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "ushort"; } + static const char *add_typedef() { return "typedef ushort Type;\n"; } +}; +template <> +struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "ushort2"; } + static const char *add_typedef() { return "typedef ushort2 Type;\n"; } + using scalar_type = cl_ushort; + typedef std::true_type is_vector_type; +}; +template <> +struct TypeManager + : public CommonTypeManager +{ + static const char *name() { return "ushort3"; } + static const char *add_typedef() { return "typedef ushort3 Type;\n"; } + typedef std::true_type is_sb_vector_size3; + using scalar_type = cl_ushort; +}; +template <> +struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "ushort4"; } + static const char *add_typedef() { return "typedef ushort4 Type;\n"; } + using scalar_type = cl_ushort; + typedef std::true_type is_vector_type; +}; +template <> +struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "ushort8"; } + static const char *add_typedef() { return "typedef ushort8 Type;\n"; } + using scalar_type = cl_ushort; + typedef std::true_type is_vector_type; +}; +template <> +struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "ushort16"; } + static const char *add_typedef() { return "typedef ushort16 Type;\n"; } + using scalar_type = cl_ushort; + typedef std::true_type is_vector_type; +}; +// cl_char +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "char"; } + static const char *add_typedef() { return "typedef char Type;\n"; } +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "char2"; } + static const char *add_typedef() { return "typedef char2 Type;\n"; } + using scalar_type = cl_char; + typedef std::true_type is_vector_type; +}; +template <> +struct TypeManager + : public CommonTypeManager +{ + static const char *name() { return "char3"; } + static const char *add_typedef() { return "typedef char3 Type;\n"; } + typedef std::true_type is_sb_vector_size3; + using scalar_type = cl_char; +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "char4"; } + static const char *add_typedef() { return "typedef char4 Type;\n"; } + using scalar_type = cl_char; + typedef std::true_type is_vector_type; +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "char8"; } + static const char *add_typedef() { return "typedef char8 Type;\n"; } + using scalar_type = cl_char; + typedef std::true_type is_vector_type; +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "char16"; } + static const char *add_typedef() { return "typedef char16 Type;\n"; } + using scalar_type = cl_char; + typedef std::true_type is_vector_type; +}; +// cl_uchar +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "uchar"; } + static const char *add_typedef() { return "typedef uchar Type;\n"; } +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "uchar2"; } + static const char *add_typedef() { return "typedef uchar2 Type;\n"; } + using scalar_type = cl_uchar; + typedef std::true_type is_vector_type; +}; +template <> +struct TypeManager + : public CommonTypeManager +{ + static const char *name() { return "uchar3"; } + static const char *add_typedef() { return "typedef uchar3 Type;\n"; } + typedef std::true_type is_sb_vector_size3; + using scalar_type = cl_uchar; +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "uchar4"; } + static const char *add_typedef() { return "typedef uchar4 Type;\n"; } + using scalar_type = cl_uchar; + typedef std::true_type is_vector_type; +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "uchar8"; } + static const char *add_typedef() { return "typedef uchar8 Type;\n"; } + using scalar_type = cl_uchar; + typedef std::true_type is_vector_type; +}; +template <> +struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "uchar16"; } + static const char *add_typedef() { return "typedef uchar16 Type;\n"; } + using scalar_type = cl_uchar; + typedef std::true_type is_vector_type; +}; +// cl_long +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "long"; } + static const char *add_typedef() { return "typedef long Type;\n"; } + static const bool type_supported(cl_device_id device) + { + return int64_ok(device); + } +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "long2"; } + static const char *add_typedef() { return "typedef long2 Type;\n"; } + using scalar_type = cl_long; + typedef std::true_type is_vector_type; + static const bool type_supported(cl_device_id device) + { + return int64_ok(device); + } +}; +template <> +struct TypeManager + : public CommonTypeManager +{ + static const char *name() { return "long3"; } + static const char *add_typedef() { return "typedef long3 Type;\n"; } + typedef std::true_type is_sb_vector_size3; + using scalar_type = cl_long; + static const bool type_supported(cl_device_id device) + { + return int64_ok(device); + } +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "long4"; } + static const char *add_typedef() { return "typedef long4 Type;\n"; } + using scalar_type = cl_long; + typedef std::true_type is_vector_type; + static const bool type_supported(cl_device_id device) + { + return int64_ok(device); + } +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "long8"; } + static const char *add_typedef() { return "typedef long8 Type;\n"; } + using scalar_type = cl_long; + typedef std::true_type is_vector_type; + static const bool type_supported(cl_device_id device) + { + return int64_ok(device); + } +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "long16"; } + static const char *add_typedef() { return "typedef long16 Type;\n"; } + using scalar_type = cl_long; + typedef std::true_type is_vector_type; + static const bool type_supported(cl_device_id device) + { + return int64_ok(device); + } +}; +// cl_ulong +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "ulong"; } + static const char *add_typedef() { return "typedef ulong Type;\n"; } + static const bool type_supported(cl_device_id device) + { + return int64_ok(device); + } +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "ulong2"; } + static const char *add_typedef() { return "typedef ulong2 Type;\n"; } + using scalar_type = cl_ulong; + typedef std::true_type is_vector_type; + static const bool type_supported(cl_device_id device) + { + return int64_ok(device); + } +}; +template <> +struct TypeManager + : public CommonTypeManager +{ + static const char *name() { return "ulong3"; } + static const char *add_typedef() { return "typedef ulong3 Type;\n"; } + typedef std::true_type is_sb_vector_size3; + using scalar_type = cl_ulong; + static const bool type_supported(cl_device_id device) + { + return int64_ok(device); + } +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "ulong4"; } + static const char *add_typedef() { return "typedef ulong4 Type;\n"; } + using scalar_type = cl_ulong; + typedef std::true_type is_vector_type; + static const bool type_supported(cl_device_id device) + { + return int64_ok(device); + } +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "ulong8"; } + static const char *add_typedef() { return "typedef ulong8 Type;\n"; } + using scalar_type = cl_ulong; + typedef std::true_type is_vector_type; + static const bool type_supported(cl_device_id device) + { + return int64_ok(device); + } +}; +template <> +struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "ulong16"; } + static const char *add_typedef() { return "typedef ulong16 Type;\n"; } + using scalar_type = cl_ulong; + typedef std::true_type is_vector_type; + static const bool type_supported(cl_device_id device) + { + return int64_ok(device); + } +}; + +// cl_float +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "float"; } + static const char *add_typedef() { return "typedef float Type;\n"; } + static cl_float identify_limits(ArithmeticOp operation) + { + switch (operation) + { + case ArithmeticOp::add_: return 0.0f; + case ArithmeticOp::max_: + return -std::numeric_limits::infinity(); + case ArithmeticOp::min_: + return std::numeric_limits::infinity(); + case ArithmeticOp::mul_: return (cl_float)1; + default: log_error("Unknown operation request"); break; + } + return 0; + } +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "float2"; } + static const char *add_typedef() { return "typedef float2 Type;\n"; } + using scalar_type = cl_float; + typedef std::true_type is_vector_type; +}; +template <> +struct TypeManager + : public CommonTypeManager +{ + static const char *name() { return "float3"; } + static const char *add_typedef() { return "typedef float3 Type;\n"; } + typedef std::true_type is_sb_vector_size3; + using scalar_type = cl_float; +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "float4"; } + static const char *add_typedef() { return "typedef float4 Type;\n"; } + using scalar_type = cl_float; + typedef std::true_type is_vector_type; +}; +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "float8"; } + static const char *add_typedef() { return "typedef float8 Type;\n"; } + using scalar_type = cl_float; + typedef std::true_type is_vector_type; +}; +template <> +struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "float16"; } + static const char *add_typedef() { return "typedef float16 Type;\n"; } + using scalar_type = cl_float; + typedef std::true_type is_vector_type; +}; + +// cl_double +template <> struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "double"; } + static const char *add_typedef() { return "typedef double Type;\n"; } + static cl_double identify_limits(ArithmeticOp operation) + { + switch (operation) + { + case ArithmeticOp::add_: return 0.0; + case ArithmeticOp::max_: + return -std::numeric_limits::infinity(); + case ArithmeticOp::min_: + return std::numeric_limits::infinity(); + case ArithmeticOp::mul_: return (cl_double)1; + default: log_error("Unknown operation request"); break; + } + return 0; + } + static const bool type_supported(cl_device_id device) + { + return double_ok(device); + } +}; +template <> +struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "double2"; } + static const char *add_typedef() { return "typedef double2 Type;\n"; } + using scalar_type = cl_double; + typedef std::true_type is_vector_type; + static const bool type_supported(cl_device_id device) + { + return double_ok(device); + } +}; +template <> +struct TypeManager + : public CommonTypeManager +{ + static const char *name() { return "double3"; } + static const char *add_typedef() { return "typedef double3 Type;\n"; } + typedef std::true_type is_sb_vector_size3; + using scalar_type = cl_double; + static const bool type_supported(cl_device_id device) + { + return double_ok(device); + } +}; +template <> +struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "double4"; } + static const char *add_typedef() { return "typedef double4 Type;\n"; } + using scalar_type = cl_double; + typedef std::true_type is_vector_type; + static const bool type_supported(cl_device_id device) + { + return double_ok(device); + } +}; +template <> +struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "double8"; } + static const char *add_typedef() { return "typedef double8 Type;\n"; } + using scalar_type = cl_double; + typedef std::true_type is_vector_type; + static const bool type_supported(cl_device_id device) + { + return double_ok(device); + } +}; +template <> +struct TypeManager : public CommonTypeManager +{ + static const char *name() { return "double16"; } + static const char *add_typedef() { return "typedef double16 Type;\n"; } + using scalar_type = cl_double; + typedef std::true_type is_vector_type; + static const bool type_supported(cl_device_id device) + { + return double_ok(device); + } +}; + +// cl_half +template <> +struct TypeManager + : public CommonTypeManager +{ + static const char *name() { return "half"; } + static const char *add_typedef() { return "typedef half Type;\n"; } + typedef std::true_type is_sb_scalar_type; + static subgroups::cl_half identify_limits(ArithmeticOp operation) + { + switch (operation) + { + case ArithmeticOp::add_: return { 0x0000 }; + case ArithmeticOp::max_: return { 0xfc00 }; + case ArithmeticOp::min_: return { 0x7c00 }; + case ArithmeticOp::mul_: return { 0x3c00 }; + default: log_error("Unknown operation request"); break; + } + return { 0 }; + } + static const bool type_supported(cl_device_id device) + { + return half_ok(device); + } +}; +template <> +struct TypeManager + : public CommonTypeManager +{ + static const char *name() { return "half2"; } + static const char *add_typedef() { return "typedef half2 Type;\n"; } + using scalar_type = subgroups::cl_half; + typedef std::true_type is_sb_vector_type; + static const bool type_supported(cl_device_id device) + { + return half_ok(device); + } +}; +template <> +struct TypeManager + : public CommonTypeManager +{ + static const char *name() { return "half3"; } + static const char *add_typedef() { return "typedef half3 Type;\n"; } + typedef std::true_type is_sb_vector_size3; + using scalar_type = subgroups::cl_half; + + static const bool type_supported(cl_device_id device) + { + return half_ok(device); + } +}; +template <> +struct TypeManager + : public CommonTypeManager +{ + static const char *name() { return "half4"; } + static const char *add_typedef() { return "typedef half4 Type;\n"; } + using scalar_type = subgroups::cl_half; + typedef std::true_type is_sb_vector_type; + static const bool type_supported(cl_device_id device) + { + return half_ok(device); + } +}; +template <> +struct TypeManager + : public CommonTypeManager +{ + static const char *name() { return "half8"; } + static const char *add_typedef() { return "typedef half8 Type;\n"; } + using scalar_type = subgroups::cl_half; + typedef std::true_type is_sb_vector_type; + + static const bool type_supported(cl_device_id device) + { + return half_ok(device); + } +}; +template <> +struct TypeManager + : public CommonTypeManager +{ + static const char *name() { return "half16"; } + static const char *add_typedef() { return "typedef half16 Type;\n"; } + using scalar_type = subgroups::cl_half; + typedef std::true_type is_sb_vector_type; + static const bool type_supported(cl_device_id device) + { + return half_ok(device); + } +}; + +// set scalar value to vector of halfs +template +typename std::enable_if::is_sb_vector_type::value>::type +set_value(Ty &lhs, const cl_ulong &rhs) +{ + const int size = sizeof(Ty) / sizeof(typename TypeManager::scalar_type); + for (auto i = 0; i < size; ++i) + { + lhs.data.s[i] = rhs; + } +} + + +// set scalar value to vector +template +typename std::enable_if::is_vector_type::value>::type +set_value(Ty &lhs, const cl_ulong &rhs) +{ + const int size = sizeof(Ty) / sizeof(typename TypeManager::scalar_type); + for (auto i = 0; i < size; ++i) + { + lhs.s[i] = rhs; + } +} + +// set vector to vector value +template +typename std::enable_if::is_vector_type::value>::type +set_value(Ty &lhs, const Ty &rhs) +{ + lhs = rhs; +} + +// set scalar value to vector size 3 +template +typename std::enable_if::is_sb_vector_size3::value>::type +set_value(Ty &lhs, const cl_ulong &rhs) +{ + for (auto i = 0; i < 3; ++i) + { + lhs.data.s[i] = rhs; + } +} + +// set scalar value to scalar +template +typename std::enable_if::value>::type +set_value(Ty &lhs, const cl_ulong &rhs) +{ + lhs = static_cast(rhs); +} + +// set scalar value to half scalar +template +typename std::enable_if::is_sb_scalar_type::value>::type +set_value(Ty &lhs, const cl_ulong &rhs) +{ + lhs.data = rhs; +} + +// compare for common vectors +template +typename std::enable_if::is_vector_type::value, bool>::type +compare(const Ty &lhs, const Ty &rhs) +{ + const int size = sizeof(Ty) / sizeof(typename TypeManager::scalar_type); + for (auto i = 0; i < size; ++i) + { + if (lhs.s[i] != rhs.s[i]) + { + return false; + } + } + return true; +} + +// compare for vectors 3 +template +typename std::enable_if::is_sb_vector_size3::value, bool>::type +compare(const Ty &lhs, const Ty &rhs) +{ + for (auto i = 0; i < 3; ++i) + { + if (lhs.data.s[i] != rhs.data.s[i]) + { + return false; + } + } + return true; +} + +// compare for half vectors +template +typename std::enable_if::is_sb_vector_type::value, bool>::type +compare(const Ty &lhs, const Ty &rhs) +{ + const int size = sizeof(Ty) / sizeof(typename TypeManager::scalar_type); + for (auto i = 0; i < size; ++i) + { + if (lhs.data.s[i] != rhs.data.s[i]) + { + return false; + } + } + return true; +} + +// compare for scalars +template +typename std::enable_if::value, bool>::type +compare(const Ty &lhs, const Ty &rhs) +{ + return lhs == rhs; +} + +// compare for scalar halfs +template +typename std::enable_if::is_sb_scalar_type::value, bool>::type +compare(const Ty &lhs, const Ty &rhs) +{ + return lhs.data == rhs.data; +} + +template inline bool compare_ordered(const Ty &lhs, const Ty &rhs) +{ + return lhs == rhs; +} + +template <> +inline bool compare_ordered(const subgroups::cl_half &lhs, + const subgroups::cl_half &rhs) +{ + return cl_half_to_float(lhs.data) == cl_half_to_float(rhs.data); +} + +template +inline bool compare_ordered(const subgroups::cl_half &lhs, const int &rhs) +{ + return cl_half_to_float(lhs.data) == rhs; +} // Run a test kernel to compute the result of a built-in on an input static int run_kernel(cl_context context, cl_command_queue queue, @@ -318,6 +1211,9 @@ static int run_kernel(cl_context context, cl_command_queue queue, NULL); test_error(error, "clEnqueueWriteBuffer failed"); + error = clEnqueueWriteBuffer(queue, xy, CL_FALSE, 0, msize, mdata, 0, NULL, + NULL); + test_error(error, "clEnqueueWriteBuffer failed"); error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); test_error(error, "clEnqueueNDRangeKernel failed"); @@ -337,42 +1233,93 @@ static int run_kernel(cl_context context, cl_command_queue queue, } // Driver for testing a single built in function -template -struct test +template struct test { + static int mrun(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements, const char *kname, + const char *src, WorkGroupParams test_params) + { + int error = TEST_PASS; + for (auto &mask : test_params.all_work_item_masks) + { + test_params.work_items_mask = mask; + error |= run(device, context, queue, num_elements, kname, src, + test_params); + } + return error; + }; static int run(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements, const char *kname, - const char *src, int dynscl, bool useCoreSubgroups) + const char *src, WorkGroupParams test_params) { size_t tmp; int error; int subgroup_size, num_subgroups; size_t realSize; - size_t global; - size_t local; + size_t global = test_params.global_workgroup_size; + size_t local = test_params.local_workgroup_size; clProgramWrapper program; clKernelWrapper kernel; cl_platform_id platform; - cl_int sgmap[2 * GSIZE]; - Ty mapin[LSIZE]; - Ty mapout[LSIZE]; + std::vector sgmap; + sgmap.resize(4 * global); + std::vector mapin; + mapin.resize(local); + std::vector mapout; + mapout.resize(local); + std::stringstream kernel_sstr; + if (test_params.work_items_mask != 0) + { + kernel_sstr << "#define WORK_ITEMS_MASK "; + kernel_sstr << "0x" << std::hex << test_params.work_items_mask + << "\n"; + } + + kernel_sstr << "#define NR_OF_ACTIVE_WORK_ITEMS "; + kernel_sstr << NR_OF_ACTIVE_WORK_ITEMS << "\n"; // Make sure a test of type Ty is supported by the device - if (!TypeCheck::val(device)) return 0; + if (!TypeManager::type_supported(device)) + { + log_info("Data type not supported : %s\n", TypeManager::name()); + return 0; + } + else + { + if (strstr(TypeManager::name(), "double")) + { + kernel_sstr << "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n"; + } + else if (strstr(TypeManager::name(), "half")) + { + kernel_sstr << "#pragma OPENCL EXTENSION cl_khr_fp16: enable\n"; + } + } + + for (std::string extension : test_params.required_extensions) + { + if (!is_extension_available(device, extension.c_str())) + { + log_info("The extension %s not supported on this device. SKIP " + "testing - kernel %s data type %s\n", + extension.c_str(), kname, TypeManager::name()); + return TEST_PASS; + } + kernel_sstr << "#pragma OPENCL EXTENSION " + extension + + ": enable\n"; + } error = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), (void *)&platform, NULL); test_error(error, "clGetDeviceInfo failed for CL_DEVICE_PLATFORM"); - std::stringstream kernel_sstr; - if (useCoreSubgroups) + if (test_params.use_core_subgroups) { kernel_sstr << "#pragma OPENCL EXTENSION cl_khr_subgroups : enable\n"; } kernel_sstr << "#define XY(M,I) M[I].x = get_sub_group_local_id(); " "M[I].y = get_sub_group_id();\n"; - kernel_sstr << TypeDef::val(); + kernel_sstr << TypeManager::add_typedef(); kernel_sstr << src; const std::string &kernel_str = kernel_sstr.str(); const char *kernel_src = kernel_str.c_str(); @@ -382,16 +1329,18 @@ struct test if (error != 0) return error; // Determine some local dimensions to use for the test. - global = GSIZE; - error = get_max_common_work_group_size(context, kernel, GSIZE, &local); + error = get_max_common_work_group_size( + context, kernel, test_params.global_workgroup_size, &local); test_error(error, "get_max_common_work_group_size failed"); // Limit it a bit so we have muliple work groups - // Ideally this will still be large enough to give us multiple subgroups - if (local > LSIZE) local = LSIZE; + // Ideally this will still be large enough to give us multiple + if (local > test_params.local_workgroup_size) + local = test_params.local_workgroup_size; + // Get the sub group info - subgroupsAPI subgroupsApiSet(platform, useCoreSubgroups); + subgroupsAPI subgroupsApiSet(platform, test_params.use_core_subgroups); clGetKernelSubGroupInfoKHR_fn clGetKernelSubGroupInfo_ptr = subgroupsApiSet.clGetKernelSubGroupInfo_ptr(); if (clGetKernelSubGroupInfo_ptr == NULL) @@ -435,8 +1384,9 @@ struct test std::vector idata; std::vector odata; - size_t input_array_size = GSIZE; - size_t output_array_size = GSIZE; + size_t input_array_size = global; + size_t output_array_size = global; + int dynscl = test_params.dynsc; if (dynscl != 0) { @@ -449,28 +1399,96 @@ struct test odata.resize(output_array_size); // Run the kernel once on zeroes to get the map - memset(&idata[0], 0, input_array_size * sizeof(Ty)); - error = run_kernel(context, queue, kernel, global, local, &idata[0], - input_array_size * sizeof(Ty), sgmap, - global * sizeof(cl_int) * 2, &odata[0], + memset(idata.data(), 0, input_array_size * sizeof(Ty)); + error = run_kernel(context, queue, kernel, global, local, idata.data(), + input_array_size * sizeof(Ty), sgmap.data(), + global * sizeof(cl_int4), odata.data(), output_array_size * sizeof(Ty), TSIZE * sizeof(Ty)); - if (error) return error; + test_error(error, "Running kernel first time failed"); // Generate the desired input for the kernel - Fns::gen(&idata[0], mapin, sgmap, subgroup_size, (int)local, - (int)global / (int)local); - error = run_kernel(context, queue, kernel, global, local, &idata[0], - input_array_size * sizeof(Ty), sgmap, - global * sizeof(cl_int) * 2, &odata[0], + test_params.subgroup_size = subgroup_size; + Fns::gen(idata.data(), mapin.data(), sgmap.data(), test_params); + error = run_kernel(context, queue, kernel, global, local, idata.data(), + input_array_size * sizeof(Ty), sgmap.data(), + global * sizeof(cl_int4), odata.data(), output_array_size * sizeof(Ty), TSIZE * sizeof(Ty)); - if (error) return error; - + test_error(error, "Running kernel second time failed"); // Check the result - return Fns::chk(&idata[0], &odata[0], mapin, mapout, sgmap, - subgroup_size, (int)local, (int)global / (int)local); + error = Fns::chk(idata.data(), odata.data(), mapin.data(), + mapout.data(), sgmap.data(), test_params); + test_error(error, "Data verification failed"); + return TEST_PASS; } }; +static void set_last_workgroup_params(int non_uniform_size, + int &number_of_subgroups, + int subgroup_size, int &workgroup_size, + int &last_subgroup_size) +{ + number_of_subgroups = 1 + non_uniform_size / subgroup_size; + last_subgroup_size = non_uniform_size % subgroup_size; + workgroup_size = non_uniform_size; +} + +template +static void set_randomdata_for_subgroup(Ty *workgroup, int wg_offset, + int current_sbs) +{ + int randomize_data = (int)(genrand_int32(gMTdata) % 3); + // Initialize data matrix indexed by local id and sub group id + switch (randomize_data) + { + case 0: + memset(&workgroup[wg_offset], 0, current_sbs * sizeof(Ty)); + break; + case 1: { + memset(&workgroup[wg_offset], 0, current_sbs * sizeof(Ty)); + int wi_id = (int)(genrand_int32(gMTdata) % (cl_uint)current_sbs); + set_value(workgroup[wg_offset + wi_id], 41); + } + break; + case 2: + memset(&workgroup[wg_offset], 0xff, current_sbs * sizeof(Ty)); + break; + } +} + +struct RunTestForType +{ + RunTestForType(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements, + WorkGroupParams test_params) + : device_(device), context_(context), queue_(queue), + num_elements_(num_elements), test_params_(test_params) + {} + template + int run_impl(const char *kernel_name, const char *source) + { + int error = TEST_PASS; + if (test_params_.all_work_item_masks.size() > 0) + { + error = test::mrun(device_, context_, queue_, num_elements_, + kernel_name, source, test_params_); + } + else + { + error = test::run(device_, context_, queue_, num_elements_, + kernel_name, source, test_params_); + } + + return error; + } + +private: + cl_device_id device_; + cl_context context_; + cl_command_queue queue_; + int num_elements_; + WorkGroupParams test_params_; +}; + #endif diff --git a/test_conformance/subgroups/test_barrier.cpp b/test_conformance/subgroups/test_barrier.cpp index e6ce1d2e..47e42f65 100644 --- a/test_conformance/subgroups/test_barrier.cpp +++ b/test_conformance/subgroups/test_barrier.cpp @@ -59,10 +59,15 @@ static const char *gbar_source = // barrier test functions template struct BAR { - static void gen(cl_int *x, cl_int *t, cl_int *m, int ns, int nw, int ng) + static void gen(cl_int *x, cl_int *t, cl_int *m, + const WorkGroupParams &test_params) { int i, ii, j, k, n; + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int ng = test_params.global_workgroup_size; int nj = (nw + ns - 1) / ns; + ng = ng / nw; int e; ii = 0; @@ -79,8 +84,7 @@ template struct BAR // Now map into work group using map from device for (j = 0; j < nw; ++j) { - i = m[2 * j + 1] * ns + m[2 * j]; - x[j] = t[i]; + x[j] = t[j]; } x += nw; @@ -89,10 +93,14 @@ template struct BAR } static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m, - int ns, int nw, int ng) + const WorkGroupParams &test_params) { int ii, i, j, k, n; + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int ng = test_params.global_workgroup_size; int nj = (nw + ns - 1) / ns; + ng = ng / nw; cl_int tr, rr; if (Which == 0) @@ -105,9 +113,8 @@ template struct BAR // Map to array indexed to array indexed by local ID and sub group for (j = 0; j < nw; ++j) { - i = m[2 * j + 1] * ns + m[2 * j]; - mx[i] = x[j]; - my[i] = y[j]; + mx[j] = x[j]; + my[j] = y[j]; } for (j = 0; j < nj; ++j) @@ -123,8 +130,9 @@ template struct BAR if (tr != rr) { log_error("ERROR: sub_group_barrier mismatch for local " - "id %d in sub group %d in group %d\n", - i, j, k); + "id %d in sub group %d in group %d expected " + "%d got %d\n", + i, j, k, tr, rr); return -1; } } @@ -144,18 +152,18 @@ int test_barrier_functions(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements, bool useCoreSubgroups) { - int error; + int error = TEST_PASS; // Adjust these individually below if desired/needed -#define G 2000 -#define L 200 - - error = test, G, L>::run(device, context, queue, - num_elements, "test_lbar", - lbar_source, 0, useCoreSubgroups); - error = test, G, L, G>::run( - device, context, queue, num_elements, "test_gbar", gbar_source, 0, - useCoreSubgroups); + constexpr size_t global_work_size = 2000; + constexpr size_t local_work_size = 200; + WorkGroupParams test_params(global_work_size, local_work_size); + test_params.use_core_subgroups = useCoreSubgroups; + error = test>::run(device, context, queue, num_elements, + "test_lbar", lbar_source, test_params); + error |= test, global_work_size>::run( + device, context, queue, num_elements, "test_gbar", gbar_source, + test_params); return error; } diff --git a/test_conformance/subgroups/test_ifp.cpp b/test_conformance/subgroups/test_ifp.cpp index 02850e5f..428f2cdc 100644 --- a/test_conformance/subgroups/test_ifp.cpp +++ b/test_conformance/subgroups/test_ifp.cpp @@ -46,7 +46,7 @@ static const char *ifp_source = "#define INST_COUNT 0x3\n" "\n" "__kernel void\n" - "test_ifp(const __global int *in, __global int2 *xy, __global int *out)\n" + "test_ifp(const __global int *in, __global int4 *xy, __global int *out)\n" "{\n" " __local atomic_int loc[NUM_LOC];\n" "\n" @@ -225,10 +225,15 @@ void run_insts(cl_int *x, cl_int *p, int n) struct IFP { - static void gen(cl_int *x, cl_int *t, cl_int *, int ns, int nw, int ng) + static void gen(cl_int *x, cl_int *t, cl_int *, + const WorkGroupParams &test_params) { int k; + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int ng = test_params.global_workgroup_size; int nj = (nw + ns - 1) / ns; + ng = ng / nw; // We need at least 2 sub groups per group for this test if (nj == 1) return; @@ -240,11 +245,15 @@ struct IFP } } - static int chk(cl_int *x, cl_int *y, cl_int *t, cl_int *, cl_int *, int ns, - int nw, int ng) + static int chk(cl_int *x, cl_int *y, cl_int *t, cl_int *, cl_int *, + const WorkGroupParams &test_params) { int i, k; + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int ng = test_params.global_workgroup_size; int nj = (nw + ns - 1) / ns; + ng = ng / nw; // We need at least 2 sub groups per group for this tes if (nj == 1) return 0; @@ -275,14 +284,17 @@ struct IFP int test_ifp(cl_device_id device, cl_context context, cl_command_queue queue, int num_elements, bool useCoreSubgroups) { - int error; + int error = TEST_PASS; + // Global/local work group sizes // Adjust these individually below if desired/needed -#define G 2000 -#define L 200 - error = test::run(device, context, queue, num_elements, - "test_ifp", ifp_source, NUM_LOC + 1, - useCoreSubgroups); + constexpr size_t global_work_size = 2000; + constexpr size_t local_work_size = 200; + WorkGroupParams test_params(global_work_size, local_work_size); + test_params.use_core_subgroups = useCoreSubgroups; + test_params.dynsc = NUM_LOC + 1; + error = test::run(device, context, queue, num_elements, + "test_ifp", ifp_source, test_params); return error; } diff --git a/test_conformance/subgroups/test_subgroup.cpp b/test_conformance/subgroups/test_subgroup.cpp new file mode 100644 index 00000000..c0e49524 --- /dev/null +++ b/test_conformance/subgroups/test_subgroup.cpp @@ -0,0 +1,217 @@ +// +// Copyright (c) 2017 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#include "procs.h" +#include "subhelpers.h" +#include "subgroup_common_kernels.h" +#include "subgroup_common_templates.h" +#include "harness/conversions.h" +#include "harness/typeWrappers.h" + +namespace { +// Any/All test functions +template struct AA +{ + static void gen(cl_int *x, cl_int *t, cl_int *m, + const WorkGroupParams &test_params) + { + int i, ii, j, k, n; + int ng = test_params.global_workgroup_size; + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int nj = (nw + ns - 1) / ns; + int e; + ng = ng / nw; + ii = 0; + log_info(" sub_group_%s...\n", operation_names(operation)); + for (k = 0; k < ng; ++k) + { + for (j = 0; j < nj; ++j) + { + ii = j * ns; + n = ii + ns > nw ? nw - ii : ns; + e = (int)(genrand_int32(gMTdata) % 3); + + // Initialize data matrix indexed by local id and sub group id + switch (e) + { + case 0: memset(&t[ii], 0, n * sizeof(cl_int)); break; + case 1: + memset(&t[ii], 0, n * sizeof(cl_int)); + i = (int)(genrand_int32(gMTdata) % (cl_uint)n); + t[ii + i] = 41; + break; + case 2: memset(&t[ii], 0xff, n * sizeof(cl_int)); break; + } + } + + // Now map into work group using map from device + for (j = 0; j < nw; ++j) + { + x[j] = t[j]; + } + + x += nw; + m += 4 * nw; + } + } + + static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m, + const WorkGroupParams &test_params) + { + int ii, i, j, k, n; + int ng = test_params.global_workgroup_size; + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int nj = (nw + ns - 1) / ns; + cl_int taa, raa; + ng = ng / nw; + + for (k = 0; k < ng; ++k) + { + // Map to array indexed to array indexed by local ID and sub group + for (j = 0; j < nw; ++j) + { + mx[j] = x[j]; + my[j] = y[j]; + } + + for (j = 0; j < nj; ++j) + { + ii = j * ns; + n = ii + ns > nw ? nw - ii : ns; + + // Compute target + if (operation == NonUniformVoteOp::any) + { + taa = 0; + for (i = 0; i < n; ++i) taa |= mx[ii + i] != 0; + } + + if (operation == NonUniformVoteOp::all) + { + taa = 1; + for (i = 0; i < n; ++i) taa &= mx[ii + i] != 0; + } + + // Check result + for (i = 0; i < n; ++i) + { + raa = my[ii + i] != 0; + if (raa != taa) + { + log_error("ERROR: sub_group_%s mismatch for local id " + "%d in sub group %d in group %d\n", + operation_names(operation), i, j, k); + return TEST_FAIL; + } + } + } + + x += nw; + y += nw; + m += 4 * nw; + } + log_info(" sub_group_%s... passed\n", operation_names(operation)); + return TEST_PASS; + } +}; + +static const char *any_source = "__kernel void test_any(const __global Type " + "*in, __global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " out[gid] = sub_group_any(in[gid]);\n" + "}\n"; + +static const char *all_source = "__kernel void test_all(const __global Type " + "*in, __global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " out[gid] = sub_group_all(in[gid]);\n" + "}\n"; + + +template +int run_broadcast_scan_reduction_for_type(RunTestForType rft) +{ + int error = rft.run_impl>( + "test_bcast", bcast_source); + error |= rft.run_impl>("test_redadd", + redadd_source); + error |= rft.run_impl>("test_redmax", + redmax_source); + error |= rft.run_impl>("test_redmin", + redmin_source); + error |= rft.run_impl>("test_scinadd", + scinadd_source); + error |= rft.run_impl>("test_scinmax", + scinmax_source); + error |= rft.run_impl>("test_scinmin", + scinmin_source); + error |= rft.run_impl>("test_scexadd", + scexadd_source); + error |= rft.run_impl>("test_scexmax", + scexmax_source); + error |= rft.run_impl>("test_scexmin", + scexmin_source); + return error; +} + +} +// Entry point from main +int test_subgroup_functions(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements, + bool useCoreSubgroups) +{ + constexpr size_t global_work_size = 2000; + constexpr size_t local_work_size = 200; + WorkGroupParams test_params(global_work_size, local_work_size); + RunTestForType rft(device, context, queue, num_elements, test_params); + int error = + rft.run_impl>("test_any", any_source); + error |= + rft.run_impl>("test_all", all_source); + error |= run_broadcast_scan_reduction_for_type(rft); + error |= run_broadcast_scan_reduction_for_type(rft); + error |= run_broadcast_scan_reduction_for_type(rft); + error |= run_broadcast_scan_reduction_for_type(rft); + error |= run_broadcast_scan_reduction_for_type(rft); + error |= run_broadcast_scan_reduction_for_type(rft); + error |= run_broadcast_scan_reduction_for_type(rft); + return error; +} + +int test_subgroup_functions_core(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements) +{ + return test_subgroup_functions(device, context, queue, num_elements, true); +} + +int test_subgroup_functions_ext(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements) +{ + bool hasExtension = is_extension_available(device, "cl_khr_subgroups"); + + if (!hasExtension) + { + log_info( + "Device does not support 'cl_khr_subgroups'. Skipping the test.\n"); + return TEST_SKIPPED_ITSELF; + } + return test_subgroup_functions(device, context, queue, num_elements, false); +} diff --git a/test_conformance/subgroups/test_subgroup_ballot.cpp b/test_conformance/subgroups/test_subgroup_ballot.cpp new file mode 100644 index 00000000..f2e4060b --- /dev/null +++ b/test_conformance/subgroups/test_subgroup_ballot.cpp @@ -0,0 +1,1089 @@ +// +// Copyright (c) 2021 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#include "procs.h" +#include "subhelpers.h" +#include "subgroup_common_templates.h" +#include "harness/typeWrappers.h" +#include + +namespace { +// Test for ballot functions +template struct BALLOT +{ + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) + { + // no work here + int gws = test_params.global_workgroup_size; + int lws = test_params.local_workgroup_size; + int sbs = test_params.subgroup_size; + int non_uniform_size = gws % lws; + log_info(" sub_group_ballot...\n"); + if (non_uniform_size) + { + log_info(" non uniform work group size mode ON\n"); + } + } + + static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) + { + int wi_id, wg_id, sb_id; + int gws = test_params.global_workgroup_size; + int lws = test_params.local_workgroup_size; + int sbs = test_params.subgroup_size; + int sb_number = (lws + sbs - 1) / sbs; + int current_sbs = 0; + cl_uint expected_result, device_result; + int non_uniform_size = gws % lws; + int wg_number = gws / lws; + wg_number = non_uniform_size ? wg_number + 1 : wg_number; + int last_subgroup_size = 0; + + for (wg_id = 0; wg_id < wg_number; ++wg_id) + { // for each work_group + if (non_uniform_size && wg_id == wg_number - 1) + { + set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws, + last_subgroup_size); + } + + for (wi_id = 0; wi_id < lws; ++wi_id) + { // inside the work_group + // read device outputs for work_group + my[wi_id] = y[wi_id]; + } + + for (sb_id = 0; sb_id < sb_number; ++sb_id) + { // for each subgroup + int wg_offset = sb_id * sbs; + if (last_subgroup_size && sb_id == sb_number - 1) + { + current_sbs = last_subgroup_size; + } + else + { + current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs; + } + for (wi_id = 0; wi_id < current_sbs; ++wi_id) + { + device_result = my[wg_offset + wi_id]; + expected_result = 1; + if (!compare(device_result, expected_result)) + { + log_error( + "ERROR: sub_group_ballot mismatch for local id " + "%d in sub group %d in group %d obtained {%d}, " + "expected {%d} \n", + wi_id, sb_id, wg_id, device_result, + expected_result); + return TEST_FAIL; + } + } + } + y += lws; + m += 4 * lws; + } + log_info(" sub_group_ballot... passed\n"); + return TEST_PASS; + } +}; + +// Test for bit extract ballot functions +template struct BALLOT_BIT_EXTRACT +{ + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) + { + int wi_id, sb_id, wg_id, l; + int gws = test_params.global_workgroup_size; + int lws = test_params.local_workgroup_size; + int sbs = test_params.subgroup_size; + int sb_number = (lws + sbs - 1) / sbs; + int wg_number = gws / lws; + int limit_sbs = sbs > 100 ? 100 : sbs; + int non_uniform_size = gws % lws; + log_info(" sub_group_%s(%s)...\n", operation_names(operation), + TypeManager::name()); + + if (non_uniform_size) + { + log_info(" non uniform work group size mode ON\n"); + } + + for (wg_id = 0; wg_id < wg_number; ++wg_id) + { // for each work_group + for (sb_id = 0; sb_id < sb_number; ++sb_id) + { // for each subgroup + int wg_offset = sb_id * sbs; + int current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs; + // rand index to bit extract + int index_for_odd = (int)(genrand_int32(gMTdata) & 0x7fffffff) + % (limit_sbs > current_sbs ? current_sbs : limit_sbs); + int index_for_even = (int)(genrand_int32(gMTdata) & 0x7fffffff) + % (limit_sbs > current_sbs ? current_sbs : limit_sbs); + for (wi_id = 0; wi_id < current_sbs; ++wi_id) + { + // index of the third element int the vector. + int midx = 4 * wg_offset + 4 * wi_id + 2; + // storing information about index to bit extract + m[midx] = (cl_int)index_for_odd; + m[++midx] = (cl_int)index_for_even; + } + set_randomdata_for_subgroup(t, wg_offset, current_sbs); + } + + // Now map into work group using map from device + for (wi_id = 0; wi_id < lws; ++wi_id) + { + x[wi_id] = t[wi_id]; + } + + x += lws; + m += 4 * lws; + } + } + + static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) + { + int wi_id, wg_id, l, sb_id; + int gws = test_params.global_workgroup_size; + int lws = test_params.local_workgroup_size; + int sbs = test_params.subgroup_size; + int sb_number = (lws + sbs - 1) / sbs; + int wg_number = gws / lws; + cl_uint4 expected_result, device_result; + int last_subgroup_size = 0; + int current_sbs = 0; + int non_uniform_size = gws % lws; + + for (wg_id = 0; wg_id < wg_number; ++wg_id) + { // for each work_group + if (non_uniform_size && wg_id == wg_number - 1) + { + set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws, + last_subgroup_size); + } + // Map to array indexed to array indexed by local ID and sub group + for (wi_id = 0; wi_id < lws; ++wi_id) + { // inside the work_group + // read host inputs for work_group + mx[wi_id] = x[wi_id]; + // read device outputs for work_group + my[wi_id] = y[wi_id]; + } + + for (sb_id = 0; sb_id < sb_number; ++sb_id) + { // for each subgroup + int wg_offset = sb_id * sbs; + if (last_subgroup_size && sb_id == sb_number - 1) + { + current_sbs = last_subgroup_size; + } + else + { + current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs; + } + // take index of array where info which work_item will + // be broadcast its value is stored + int midx = 4 * wg_offset + 2; + // take subgroup local id of this work_item + int index_for_odd = (int)m[midx]; + int index_for_even = (int)m[++midx]; + + for (wi_id = 0; wi_id < current_sbs; ++wi_id) + { // for each subgroup + int bit_value = 0; + // from which value of bitfield bit + // verification will be done + int take_shift = + (wi_id & 1) ? index_for_odd % 32 : index_for_even % 32; + int bit_mask = 1 << take_shift; + + if (wi_id < 32) + (mx[wg_offset + wi_id].s0 & bit_mask) > 0 + ? bit_value = 1 + : bit_value = 0; + if (wi_id >= 32 && wi_id < 64) + (mx[wg_offset + wi_id].s1 & bit_mask) > 0 + ? bit_value = 1 + : bit_value = 0; + if (wi_id >= 64 && wi_id < 96) + (mx[wg_offset + wi_id].s2 & bit_mask) > 0 + ? bit_value = 1 + : bit_value = 0; + if (wi_id >= 96 && wi_id < 128) + (mx[wg_offset + wi_id].s3 & bit_mask) > 0 + ? bit_value = 1 + : bit_value = 0; + + if (wi_id & 1) + { + bit_value ? expected_result = { 1, 0, 0, 1 } + : expected_result = { 0, 0, 0, 1 }; + } + else + { + bit_value ? expected_result = { 1, 0, 0, 2 } + : expected_result = { 0, 0, 0, 2 }; + } + + device_result = my[wg_offset + wi_id]; + if (!compare(device_result, expected_result)) + { + log_error( + "ERROR: sub_group_%s mismatch for local id %d in " + "sub group %d in group %d obtained {%d, %d, %d, " + "%d}, expected {%d, %d, %d, %d}\n", + operation_names(operation), wi_id, sb_id, wg_id, + device_result.s0, device_result.s1, + device_result.s2, device_result.s3, + expected_result.s0, expected_result.s1, + expected_result.s2, expected_result.s3); + return TEST_FAIL; + } + } + } + x += lws; + y += lws; + m += 4 * lws; + } + log_info(" sub_group_%s(%s)... passed\n", operation_names(operation), + TypeManager::name()); + return TEST_PASS; + } +}; + +template struct BALLOT_INVERSE +{ + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) + { + int gws = test_params.global_workgroup_size; + int lws = test_params.local_workgroup_size; + int sbs = test_params.subgroup_size; + int non_uniform_size = gws % lws; + log_info(" sub_group_inverse_ballot...\n"); + if (non_uniform_size) + { + log_info(" non uniform work group size mode ON\n"); + } + // no work here + } + + static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) + { + int wi_id, wg_id, sb_id; + int gws = test_params.global_workgroup_size; + int lws = test_params.local_workgroup_size; + int sbs = test_params.subgroup_size; + int sb_number = (lws + sbs - 1) / sbs; + cl_uint4 expected_result, device_result; + int non_uniform_size = gws % lws; + int wg_number = gws / lws; + int last_subgroup_size = 0; + int current_sbs = 0; + if (non_uniform_size) wg_number++; + + for (wg_id = 0; wg_id < wg_number; ++wg_id) + { // for each work_group + if (non_uniform_size && wg_id == wg_number - 1) + { + set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws, + last_subgroup_size); + } + // Map to array indexed to array indexed by local ID and sub group + for (wi_id = 0; wi_id < lws; ++wi_id) + { // inside the work_group + mx[wi_id] = x[wi_id]; // read host inputs for work_group + my[wi_id] = y[wi_id]; // read device outputs for work_group + } + + for (sb_id = 0; sb_id < sb_number; ++sb_id) + { // for each subgroup + int wg_offset = sb_id * sbs; + if (last_subgroup_size && sb_id == sb_number - 1) + { + current_sbs = last_subgroup_size; + } + else + { + current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs; + } + // take index of array where info which work_item will + // be broadcast its value is stored + int midx = 4 * wg_offset + 2; + // take subgroup local id of this work_item + // Check result + for (wi_id = 0; wi_id < current_sbs; ++wi_id) + { // for each subgroup work item + + wi_id & 1 ? expected_result = { 1, 0, 0, 1 } + : expected_result = { 1, 0, 0, 2 }; + + device_result = my[wg_offset + wi_id]; + if (!compare(device_result, expected_result)) + { + log_error( + "ERROR: sub_group_%s mismatch for local id %d in " + "sub group %d in group %d obtained {%d, %d, %d, " + "%d}, expected {%d, %d, %d, %d}\n", + operation_names(operation), wi_id, sb_id, wg_id, + device_result.s0, device_result.s1, + device_result.s2, device_result.s3, + expected_result.s0, expected_result.s1, + expected_result.s2, expected_result.s3); + return TEST_FAIL; + } + } + } + x += lws; + y += lws; + m += 4 * lws; + } + + log_info(" sub_group_inverse_ballot... passed\n"); + return TEST_PASS; + } +}; + + +// Test for bit count/inclusive and exclusive scan/ find lsb msb ballot function +template struct BALLOT_COUNT_SCAN_FIND +{ + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) + { + int wi_id, wg_id, sb_id; + int gws = test_params.global_workgroup_size; + int lws = test_params.local_workgroup_size; + int sbs = test_params.subgroup_size; + int sb_number = (lws + sbs - 1) / sbs; + int non_uniform_size = gws % lws; + int wg_number = gws / lws; + int last_subgroup_size = 0; + int current_sbs = 0; + + log_info(" sub_group_%s(%s)...\n", operation_names(operation), + TypeManager::name()); + if (non_uniform_size) + { + log_info(" non uniform work group size mode ON\n"); + wg_number++; + } + int e; + for (wg_id = 0; wg_id < wg_number; ++wg_id) + { // for each work_group + if (non_uniform_size && wg_id == wg_number - 1) + { + set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws, + last_subgroup_size); + } + for (sb_id = 0; sb_id < sb_number; ++sb_id) + { // for each subgroup + int wg_offset = sb_id * sbs; + if (last_subgroup_size && sb_id == sb_number - 1) + { + current_sbs = last_subgroup_size; + } + else + { + current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs; + } + if (operation == BallotOp::ballot_bit_count + || operation == BallotOp::ballot_inclusive_scan + || operation == BallotOp::ballot_exclusive_scan) + { + set_randomdata_for_subgroup(t, wg_offset, current_sbs); + } + else if (operation == BallotOp::ballot_find_lsb + || operation == BallotOp::ballot_find_msb) + { + // Regarding to the spec, find lsb and find msb result is + // undefined behavior if input value is zero, so generate + // only non-zero values. + for (wi_id = 0; wi_id < current_sbs; ++wi_id) + { + char x = (genrand_int32(gMTdata)) & 0xff; + // undefined behaviour in case of 0; + x = x ? x : 1; + memset(&t[wg_offset + wi_id], x, sizeof(Ty)); + } + } + else + { + log_error("Unknown operation..."); + } + } + + // Now map into work group using map from device + for (wi_id = 0; wi_id < lws; ++wi_id) + { + x[wi_id] = t[wi_id]; + } + + x += lws; + m += 4 * lws; + } + } + + static bs128 getImportantBits(cl_uint sub_group_local_id, + cl_uint sub_group_size) + { + bs128 mask; + if (operation == BallotOp::ballot_bit_count + || operation == BallotOp::ballot_find_lsb + || operation == BallotOp::ballot_find_msb) + { + for (cl_uint i = 0; i < sub_group_size; ++i) mask.set(i); + } + else if (operation == BallotOp::ballot_inclusive_scan + || operation == BallotOp::ballot_exclusive_scan) + { + for (cl_uint i = 0; i <= sub_group_local_id; ++i) mask.set(i); + if (operation == BallotOp::ballot_exclusive_scan) + mask.reset(sub_group_local_id); + } + return mask; + } + + static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) + { + int wi_id, wg_id, sb_id; + int gws = test_params.global_workgroup_size; + int lws = test_params.local_workgroup_size; + int sbs = test_params.subgroup_size; + int sb_number = (lws + sbs - 1) / sbs; + int non_uniform_size = gws % lws; + int wg_number = gws / lws; + wg_number = non_uniform_size ? wg_number + 1 : wg_number; + cl_uint4 expected_result, device_result; + int last_subgroup_size = 0; + int current_sbs = 0; + + for (wg_id = 0; wg_id < wg_number; ++wg_id) + { // for each work_group + if (non_uniform_size && wg_id == wg_number - 1) + { + set_last_workgroup_params(non_uniform_size, sb_number, sbs, lws, + last_subgroup_size); + } + // Map to array indexed to array indexed by local ID and sub group + for (wi_id = 0; wi_id < lws; ++wi_id) + { // inside the work_group + // read host inputs for work_group + mx[wi_id] = x[wi_id]; + // read device outputs for work_group + my[wi_id] = y[wi_id]; + } + + for (sb_id = 0; sb_id < sb_number; ++sb_id) + { // for each subgroup + int wg_offset = sb_id * sbs; + if (last_subgroup_size && sb_id == sb_number - 1) + { + current_sbs = last_subgroup_size; + } + else + { + current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs; + } + // Check result + expected_result = { 0, 0, 0, 0 }; + for (wi_id = 0; wi_id < current_sbs; ++wi_id) + { // for subgroup element + bs128 bs; + // convert cl_uint4 input into std::bitset<128> + bs |= bs128(mx[wg_offset + wi_id].s0) + | (bs128(mx[wg_offset + wi_id].s1) << 32) + | (bs128(mx[wg_offset + wi_id].s2) << 64) + | (bs128(mx[wg_offset + wi_id].s3) << 96); + bs &= getImportantBits(wi_id, current_sbs); + device_result = my[wg_offset + wi_id]; + if (operation == BallotOp::ballot_inclusive_scan + || operation == BallotOp::ballot_exclusive_scan + || operation == BallotOp::ballot_bit_count) + { + expected_result.s0 = bs.count(); + if (!compare(device_result, expected_result)) + { + log_error("ERROR: sub_group_%s " + "mismatch for local id %d in sub group " + "%d in group %d obtained {%d, %d, %d, " + "%d}, expected {%d, %d, %d, %d}\n", + operation_names(operation), wi_id, sb_id, + wg_id, device_result.s0, device_result.s1, + device_result.s2, device_result.s3, + expected_result.s0, expected_result.s1, + expected_result.s2, expected_result.s3); + return TEST_FAIL; + } + } + else if (operation == BallotOp::ballot_find_lsb) + { + for (int id = 0; id < current_sbs; ++id) + { + if (bs.test(id)) + { + expected_result.s0 = id; + break; + } + } + if (!compare(device_result, expected_result)) + { + log_error("ERROR: sub_group_ballot_find_lsb " + "mismatch for local id %d in sub group " + "%d in group %d obtained {%d, %d, %d, " + "%d}, expected {%d, %d, %d, %d}\n", + wi_id, sb_id, wg_id, device_result.s0, + device_result.s1, device_result.s2, + device_result.s3, expected_result.s0, + expected_result.s1, expected_result.s2, + expected_result.s3); + return TEST_FAIL; + } + } + else if (operation == BallotOp::ballot_find_msb) + { + for (int id = current_sbs - 1; id >= 0; --id) + { + if (bs.test(id)) + { + expected_result.s0 = id; + break; + } + } + if (!compare(device_result, expected_result)) + { + log_error("ERROR: sub_group_ballot_find_msb " + "mismatch for local id %d in sub group " + "%d in group %d obtained {%d, %d, %d, " + "%d}, expected {%d, %d, %d, %d}\n", + wi_id, sb_id, wg_id, device_result.s0, + device_result.s1, device_result.s2, + device_result.s3, expected_result.s0, + expected_result.s1, expected_result.s2, + expected_result.s3); + return TEST_FAIL; + } + } + } + } + x += lws; + y += lws; + m += 4 * lws; + } + log_info(" sub_group_ballot_%s(%s)... passed\n", + operation_names(operation), TypeManager::name()); + return TEST_PASS; + } +}; + +// test mask functions +template struct SMASK +{ + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) + { + int wi_id, wg_id, l, sb_id; + int gws = test_params.global_workgroup_size; + int lws = test_params.local_workgroup_size; + int sbs = test_params.subgroup_size; + int sb_number = (lws + sbs - 1) / sbs; + int wg_number = gws / lws; + log_info(" get_sub_group_%s_mask...\n", operation_names(operation)); + for (wg_id = 0; wg_id < wg_number; ++wg_id) + { // for each work_group + for (sb_id = 0; sb_id < sb_number; ++sb_id) + { // for each subgroup + int wg_offset = sb_id * sbs; + int current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs; + // Produce expected masks for each work item in the subgroup + for (wi_id = 0; wi_id < current_sbs; ++wi_id) + { + int midx = 4 * wg_offset + 4 * wi_id; + cl_uint max_sub_group_size = m[midx + 2]; + cl_uint4 expected_mask = { 0 }; + expected_mask = generate_bit_mask( + wi_id, operation_names(operation), max_sub_group_size); + set_value(t[wg_offset + wi_id], expected_mask); + } + } + + // Now map into work group using map from device + for (wi_id = 0; wi_id < lws; ++wi_id) + { + x[wi_id] = t[wi_id]; + } + x += lws; + m += 4 * lws; + } + } + + static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) + { + int wi_id, wg_id, sb_id; + int gws = test_params.global_workgroup_size; + int lws = test_params.local_workgroup_size; + int sbs = test_params.subgroup_size; + int sb_number = (lws + sbs - 1) / sbs; + Ty expected_result, device_result; + int wg_number = gws / lws; + + for (wg_id = 0; wg_id < wg_number; ++wg_id) + { // for each work_group + for (wi_id = 0; wi_id < lws; ++wi_id) + { // inside the work_group + mx[wi_id] = x[wi_id]; // read host inputs for work_group + my[wi_id] = y[wi_id]; // read device outputs for work_group + } + + for (sb_id = 0; sb_id < sb_number; ++sb_id) + { + int wg_offset = sb_id * sbs; + int current_sbs = wg_offset + sbs > lws ? lws - wg_offset : sbs; + + // Check result + for (wi_id = 0; wi_id < current_sbs; ++wi_id) + { // inside the subgroup + expected_result = + mx[wg_offset + wi_id]; // read host input for subgroup + device_result = + my[wg_offset + + wi_id]; // read device outputs for subgroup + if (!compare(device_result, expected_result)) + { + log_error("ERROR: get_sub_group_%s_mask... mismatch " + "for local id %d in sub group %d in group " + "%d, obtained %d, expected %d\n", + operation_names(operation), wi_id, sb_id, + wg_id, device_result, expected_result); + return TEST_FAIL; + } + } + } + x += lws; + y += lws; + m += 4 * lws; + } + log_info(" get_sub_group_%s_mask... passed\n", + operation_names(operation)); + return TEST_PASS; + } +}; + +static const char *bcast_non_uniform_source = + "__kernel void test_bcast_non_uniform(const __global Type *in, __global " + "int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " Type x = in[gid];\n" + " if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n" + " out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].z);\n" + " } else {\n" + " out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].w);\n" + " }\n" + "}\n"; + +static const char *bcast_first_source = + "__kernel void test_bcast_first(const __global Type *in, __global int4 " + "*xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " Type x = in[gid];\n" + " if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n" + " out[gid] = sub_group_broadcast_first(x);\n" + " } else {\n" + " out[gid] = sub_group_broadcast_first(x);\n" + " }\n" + "}\n"; + +static const char *ballot_bit_count_source = + "__kernel void test_sub_group_ballot_bit_count(const __global Type *in, " + "__global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " Type x = in[gid];\n" + " uint4 value = (uint4)(0,0,0,0);\n" + " value = (uint4)(sub_group_ballot_bit_count(x),0,0,0);\n" + " out[gid] = value;\n" + "}\n"; + +static const char *ballot_inclusive_scan_source = + "__kernel void test_sub_group_ballot_inclusive_scan(const __global Type " + "*in, __global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " Type x = in[gid];\n" + " uint4 value = (uint4)(0,0,0,0);\n" + " value = (uint4)(sub_group_ballot_inclusive_scan(x),0,0,0);\n" + " out[gid] = value;\n" + "}\n"; + +static const char *ballot_exclusive_scan_source = + "__kernel void test_sub_group_ballot_exclusive_scan(const __global Type " + "*in, __global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " Type x = in[gid];\n" + " uint4 value = (uint4)(0,0,0,0);\n" + " value = (uint4)(sub_group_ballot_exclusive_scan(x),0,0,0);\n" + " out[gid] = value;\n" + "}\n"; + +static const char *ballot_find_lsb_source = + "__kernel void test_sub_group_ballot_find_lsb(const __global Type *in, " + "__global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " Type x = in[gid];\n" + " uint4 value = (uint4)(0,0,0,0);\n" + " value = (uint4)(sub_group_ballot_find_lsb(x),0,0,0);\n" + " out[gid] = value;\n" + "}\n"; + +static const char *ballot_find_msb_source = + "__kernel void test_sub_group_ballot_find_msb(const __global Type *in, " + "__global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " Type x = in[gid];\n" + " uint4 value = (uint4)(0,0,0,0);" + " value = (uint4)(sub_group_ballot_find_msb(x),0,0,0);" + " out[gid] = value ;" + "}\n"; + +static const char *get_subgroup_ge_mask_source = + "__kernel void test_get_sub_group_ge_mask(const __global Type *in, " + "__global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " xy[gid].z = get_max_sub_group_size();\n" + " Type x = in[gid];\n" + " uint4 mask = get_sub_group_ge_mask();" + " out[gid] = mask;\n" + "}\n"; + +static const char *get_subgroup_gt_mask_source = + "__kernel void test_get_sub_group_gt_mask(const __global Type *in, " + "__global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " xy[gid].z = get_max_sub_group_size();\n" + " Type x = in[gid];\n" + " uint4 mask = get_sub_group_gt_mask();" + " out[gid] = mask;\n" + "}\n"; + +static const char *get_subgroup_le_mask_source = + "__kernel void test_get_sub_group_le_mask(const __global Type *in, " + "__global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " xy[gid].z = get_max_sub_group_size();\n" + " Type x = in[gid];\n" + " uint4 mask = get_sub_group_le_mask();" + " out[gid] = mask;\n" + "}\n"; + +static const char *get_subgroup_lt_mask_source = + "__kernel void test_get_sub_group_lt_mask(const __global Type *in, " + "__global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " xy[gid].z = get_max_sub_group_size();\n" + " Type x = in[gid];\n" + " uint4 mask = get_sub_group_lt_mask();" + " out[gid] = mask;\n" + "}\n"; + +static const char *get_subgroup_eq_mask_source = + "__kernel void test_get_sub_group_eq_mask(const __global Type *in, " + "__global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " xy[gid].z = get_max_sub_group_size();\n" + " Type x = in[gid];\n" + " uint4 mask = get_sub_group_eq_mask();" + " out[gid] = mask;\n" + "}\n"; + +static const char *ballot_source = + "__kernel void test_sub_group_ballot(const __global Type *in, " + "__global int4 *xy, __global Type *out)\n" + "{\n" + "uint4 full_ballot = sub_group_ballot(1);\n" + "uint divergence_mask;\n" + "uint4 partial_ballot;\n" + "uint gid = get_global_id(0);" + "XY(xy,gid);\n" + "if (get_sub_group_local_id() & 1) {\n" + " divergence_mask = 0xaaaaaaaa;\n" + " partial_ballot = sub_group_ballot(1);\n" + "} else {\n" + " divergence_mask = 0x55555555;\n" + " partial_ballot = sub_group_ballot(1);\n" + "}\n" + " size_t lws = get_local_size(0);\n" + "uint4 masked_ballot = full_ballot;\n" + "masked_ballot.x &= divergence_mask;\n" + "masked_ballot.y &= divergence_mask;\n" + "masked_ballot.z &= divergence_mask;\n" + "masked_ballot.w &= divergence_mask;\n" + "out[gid] = all(masked_ballot == partial_ballot);\n" + + "} \n"; + +static const char *ballot_source_inverse = + "__kernel void test_sub_group_ballot_inverse(const __global " + "Type *in, " + "__global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " Type x = in[gid];\n" + " uint4 value = (uint4)(10,0,0,0);\n" + " if (get_sub_group_local_id() & 1) {" + " uint4 partial_ballot_mask = " + "(uint4)(0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA);" + " if (sub_group_inverse_ballot(partial_ballot_mask)) {\n" + " value = (uint4)(1,0,0,1);\n" + " } else {\n" + " value = (uint4)(0,0,0,1);\n" + " }\n" + " } else {\n" + " uint4 partial_ballot_mask = " + "(uint4)(0x55555555,0x55555555,0x55555555,0x55555555);" + " if (sub_group_inverse_ballot(partial_ballot_mask)) {\n" + " value = (uint4)(1,0,0,2);\n" + " } else {\n" + " value = (uint4)(0,0,0,2);\n" + " }\n" + " }\n" + " out[gid] = value;\n" + "}\n"; + +static const char *ballot_bit_extract_source = + "__kernel void test_sub_group_ballot_bit_extract(const __global Type *in, " + "__global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " Type x = in[gid];\n" + " uint index = xy[gid].z;\n" + " uint4 value = (uint4)(10,0,0,0);\n" + " if (get_sub_group_local_id() & 1) {" + " if (sub_group_ballot_bit_extract(x, xy[gid].z)) {\n" + " value = (uint4)(1,0,0,1);\n" + " } else {\n" + " value = (uint4)(0,0,0,1);\n" + " }\n" + " } else {\n" + " if (sub_group_ballot_bit_extract(x, xy[gid].w)) {\n" + " value = (uint4)(1,0,0,2);\n" + " } else {\n" + " value = (uint4)(0,0,0,2);\n" + " }\n" + " }\n" + " out[gid] = value;\n" + "}\n"; + +template int run_non_uniform_broadcast_for_type(RunTestForType rft) +{ + int error = + rft.run_impl>( + "test_bcast_non_uniform", bcast_non_uniform_source); + return error; +} + + +} + +int test_subgroup_functions_ballot(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements) +{ + std::vector required_extensions = { "cl_khr_subgroup_ballot" }; + constexpr size_t global_work_size = 170; + constexpr size_t local_work_size = 64; + WorkGroupParams test_params(global_work_size, local_work_size, + required_extensions); + RunTestForType rft(device, context, queue, num_elements, test_params); + + // non uniform broadcast functions + int error = run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + error |= run_non_uniform_broadcast_for_type(rft); + + // broadcast first functions + error |= + rft.run_impl>( + "test_bcast_first", bcast_first_source); + error |= rft.run_impl>( + "test_bcast_first", bcast_first_source); + error |= rft.run_impl>( + "test_bcast_first", bcast_first_source); + error |= rft.run_impl>( + "test_bcast_first", bcast_first_source); + error |= rft.run_impl>( + "test_bcast_first", bcast_first_source); + error |= rft.run_impl>( + "test_bcast_first", bcast_first_source); + error |= rft.run_impl>( + "test_bcast_first", bcast_first_source); + error |= rft.run_impl>( + "test_bcast_first", bcast_first_source); + error |= rft.run_impl>( + "test_bcast_first", bcast_first_source); + error |= rft.run_impl>( + "test_bcast_first", bcast_first_source); + error |= rft.run_impl< + subgroups::cl_half, + BC>( + "test_bcast_first", bcast_first_source); + + // mask functions + error |= rft.run_impl>( + "test_get_sub_group_eq_mask", get_subgroup_eq_mask_source); + error |= rft.run_impl>( + "test_get_sub_group_ge_mask", get_subgroup_ge_mask_source); + error |= rft.run_impl>( + "test_get_sub_group_gt_mask", get_subgroup_gt_mask_source); + error |= rft.run_impl>( + "test_get_sub_group_le_mask", get_subgroup_le_mask_source); + error |= rft.run_impl>( + "test_get_sub_group_lt_mask", get_subgroup_lt_mask_source); + + // ballot functions + error |= rft.run_impl>("test_sub_group_ballot", + ballot_source); + error |= rft.run_impl>( + "test_sub_group_ballot_inverse", ballot_source_inverse); + error |= rft.run_impl< + cl_uint4, BALLOT_BIT_EXTRACT>( + "test_sub_group_ballot_bit_extract", ballot_bit_extract_source); + error |= rft.run_impl< + cl_uint4, BALLOT_COUNT_SCAN_FIND>( + "test_sub_group_ballot_bit_count", ballot_bit_count_source); + error |= rft.run_impl< + cl_uint4, + BALLOT_COUNT_SCAN_FIND>( + "test_sub_group_ballot_inclusive_scan", ballot_inclusive_scan_source); + error |= rft.run_impl< + cl_uint4, + BALLOT_COUNT_SCAN_FIND>( + "test_sub_group_ballot_exclusive_scan", ballot_exclusive_scan_source); + error |= rft.run_impl< + cl_uint4, BALLOT_COUNT_SCAN_FIND>( + "test_sub_group_ballot_find_lsb", ballot_find_lsb_source); + error |= rft.run_impl< + cl_uint4, BALLOT_COUNT_SCAN_FIND>( + "test_sub_group_ballot_find_msb", ballot_find_msb_source); + return error; +} diff --git a/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp new file mode 100644 index 00000000..588e9cee --- /dev/null +++ b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp @@ -0,0 +1,340 @@ +// +// Copyright (c) 2021 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#include "procs.h" +#include "subhelpers.h" +#include "subgroup_common_templates.h" +#include "harness/typeWrappers.h" + +#define CLUSTER_SIZE 4 +#define CLUSTER_SIZE_STR "4" + +namespace { +static const char *redadd_clustered_source = + "__kernel void test_redadd_clustered(const __global Type *in, __global " + "int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " xy[gid].w = 0;\n" + " if (sizeof(in[gid]) == " + "sizeof(sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR ")))\n" + " {xy[gid].w = sizeof(in[gid]);}\n" + " out[gid] = sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR + ");\n" + "}\n"; + +static const char *redmax_clustered_source = + "__kernel void test_redmax_clustered(const __global Type *in, __global " + "int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " xy[gid].w = 0;\n" + " if (sizeof(in[gid]) == " + "sizeof(sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR ")))\n" + " {xy[gid].w = sizeof(in[gid]);}\n" + " out[gid] = sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR + ");\n" + "}\n"; + +static const char *redmin_clustered_source = + "__kernel void test_redmin_clustered(const __global Type *in, __global " + "int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " xy[gid].w = 0;\n" + " if (sizeof(in[gid]) == " + "sizeof(sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR ")))\n" + " {xy[gid].w = sizeof(in[gid]);}\n" + " out[gid] = sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR + ");\n" + "}\n"; + +static const char *redmul_clustered_source = + "__kernel void test_redmul_clustered(const __global Type *in, __global " + "int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " xy[gid].w = 0;\n" + " if (sizeof(in[gid]) == " + "sizeof(sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR ")))\n" + " {xy[gid].w = sizeof(in[gid]);}\n" + " out[gid] = sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR + ");\n" + "}\n"; + +static const char *redand_clustered_source = + "__kernel void test_redand_clustered(const __global Type *in, __global " + "int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " xy[gid].w = 0;\n" + " if (sizeof(in[gid]) == " + "sizeof(sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR ")))\n" + " {xy[gid].w = sizeof(in[gid]);}\n" + " out[gid] = sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR + ");\n" + "}\n"; + +static const char *redor_clustered_source = + "__kernel void test_redor_clustered(const __global Type *in, __global int4 " + "*xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " xy[gid].w = 0;\n" + " if (sizeof(in[gid]) == " + "sizeof(sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR ")))\n" + " {xy[gid].w = sizeof(in[gid]);}\n" + " out[gid] = sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR + ");\n" + "}\n"; + +static const char *redxor_clustered_source = + "__kernel void test_redxor_clustered(const __global Type *in, __global " + "int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " xy[gid].w = 0;\n" + " if (sizeof(in[gid]) == " + "sizeof(sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR ")))\n" + " {xy[gid].w = sizeof(in[gid]);}\n" + " out[gid] = sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR + ");\n" + "}\n"; + +static const char *redand_clustered_logical_source = + "__kernel void test_redand_clustered_logical(const __global Type *in, " + "__global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " xy[gid].w = 0;\n" + " if (sizeof(in[gid]) == " + "sizeof(sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR + ")))\n" + " {xy[gid].w = sizeof(in[gid]);}\n" + " out[gid] = " + "sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR ");\n" + "}\n"; + +static const char *redor_clustered_logical_source = + "__kernel void test_redor_clustered_logical(const __global Type *in, " + "__global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " xy[gid].w = 0;\n" + " if (sizeof(in[gid]) == " + "sizeof(sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR + ")))\n" + " {xy[gid].w = sizeof(in[gid]);}\n" + " out[gid] = " + "sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR ");\n" + "}\n"; + +static const char *redxor_clustered_logical_source = + "__kernel void test_redxor_clustered_logical(const __global Type *in, " + "__global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " xy[gid].w = 0;\n" + " if ( sizeof(in[gid]) == " + "sizeof(sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR + ")))\n" + " {xy[gid].w = sizeof(in[gid]);}\n" + " out[gid] = " + "sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR ");\n" + "}\n"; + + +// DESCRIPTION: +// Test for reduce cluster functions +template struct RED_CLU +{ + static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params) + { + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int ng = test_params.global_workgroup_size; + ng = ng / nw; + log_info(" sub_group_clustered_reduce_%s(%s, %d bytes) ...\n", + operation_names(operation), TypeManager::name(), + sizeof(Ty)); + genrand(x, t, m, ns, nw, ng); + } + + static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, + const WorkGroupParams &test_params) + { + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int ng = test_params.global_workgroup_size; + int nj = (nw + ns - 1) / ns; + ng = ng / nw; + + for (int k = 0; k < ng; ++k) + { + std::vector data_type_sizes; + // Map to array indexed to array indexed by local ID and sub group + for (int j = 0; j < nw; ++j) + { + mx[j] = x[j]; + my[j] = y[j]; + data_type_sizes.push_back(m[4 * j + 3]); + } + + for (cl_int dts : data_type_sizes) + { + if (dts != sizeof(Ty)) + { + log_error("ERROR: sub_group_clustered_reduce_%s(%s) " + "wrong data type size detected, expected: %d, " + "used by device %d, in group %d\n", + operation_names(operation), + TypeManager::name(), sizeof(Ty), dts, k); + return TEST_FAIL; + } + } + + for (int j = 0; j < nj; ++j) + { + int ii = j * ns; + int n = ii + ns > nw ? nw - ii : ns; + int midx = 4 * ii + 2; + std::vector clusters_results; + int clusters_counter = ns / CLUSTER_SIZE; + clusters_results.resize(clusters_counter); + + // Compute target + Ty tr = mx[ii]; + for (int i = 0; i < n; ++i) + { + if (i % CLUSTER_SIZE == 0) + tr = mx[ii + i]; + else + tr = calculate(tr, mx[ii + i], operation); + clusters_results[i / CLUSTER_SIZE] = tr; + } + + // Check result + for (int i = 0; i < n; ++i) + { + Ty rr = my[ii + i]; + tr = clusters_results[i / CLUSTER_SIZE]; + if (!compare(rr, tr)) + { + log_error( + "ERROR: sub_group_clustered_reduce_%s(%s) mismatch " + "for local id %d in sub group %d in group %d\n", + operation_names(operation), TypeManager::name(), + i, j, k); + return TEST_FAIL; + } + } + } + + x += nw; + y += nw; + m += 4 * nw; + } + log_info(" sub_group_clustered_reduce_%s(%s, %d bytes) ... passed\n", + operation_names(operation), TypeManager::name(), + sizeof(Ty)); + return TEST_PASS; + } +}; + +template +int run_cluster_red_add_max_min_mul_for_type(RunTestForType rft) +{ + int error = rft.run_impl>( + "test_redadd_clustered", redadd_clustered_source); + error |= rft.run_impl>( + "test_redmax_clustered", redmax_clustered_source); + error |= rft.run_impl>( + "test_redmin_clustered", redmin_clustered_source); + error |= rft.run_impl>( + "test_redmul_clustered", redmul_clustered_source); + return error; +} +template int run_cluster_and_or_xor_for_type(RunTestForType rft) +{ + int error = rft.run_impl>( + "test_redand_clustered", redand_clustered_source); + error |= rft.run_impl>( + "test_redor_clustered", redor_clustered_source); + error |= rft.run_impl>( + "test_redxor_clustered", redxor_clustered_source); + return error; +} +template +int run_cluster_logical_and_or_xor_for_type(RunTestForType rft) +{ + int error = rft.run_impl>( + "test_redand_clustered_logical", redand_clustered_logical_source); + error |= rft.run_impl>( + "test_redor_clustered_logical", redor_clustered_logical_source); + error |= rft.run_impl>( + "test_redxor_clustered_logical", redxor_clustered_logical_source); + + return error; +} +} + +int test_subgroup_functions_clustered_reduce(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements) +{ + std::vector required_extensions = { + "cl_khr_subgroup_clustered_reduce" + }; + constexpr size_t global_work_size = 2000; + constexpr size_t local_work_size = 200; + WorkGroupParams test_params(global_work_size, local_work_size, + required_extensions); + RunTestForType rft(device, context, queue, num_elements, test_params); + + int error = run_cluster_red_add_max_min_mul_for_type(rft); + error |= run_cluster_red_add_max_min_mul_for_type(rft); + error |= run_cluster_red_add_max_min_mul_for_type(rft); + error |= run_cluster_red_add_max_min_mul_for_type(rft); + error |= run_cluster_red_add_max_min_mul_for_type(rft); + error |= run_cluster_red_add_max_min_mul_for_type(rft); + error |= run_cluster_red_add_max_min_mul_for_type(rft); + error |= run_cluster_red_add_max_min_mul_for_type(rft); + error |= run_cluster_red_add_max_min_mul_for_type(rft); + error |= run_cluster_red_add_max_min_mul_for_type(rft); + error |= run_cluster_red_add_max_min_mul_for_type(rft); + + error |= run_cluster_and_or_xor_for_type(rft); + error |= run_cluster_and_or_xor_for_type(rft); + error |= run_cluster_and_or_xor_for_type(rft); + error |= run_cluster_and_or_xor_for_type(rft); + error |= run_cluster_and_or_xor_for_type(rft); + error |= run_cluster_and_or_xor_for_type(rft); + error |= run_cluster_and_or_xor_for_type(rft); + error |= run_cluster_and_or_xor_for_type(rft); + + error |= run_cluster_logical_and_or_xor_for_type(rft); + return error; +} diff --git a/test_conformance/subgroups/test_subgroup_extended_types.cpp b/test_conformance/subgroups/test_subgroup_extended_types.cpp new file mode 100644 index 00000000..98401b8e --- /dev/null +++ b/test_conformance/subgroups/test_subgroup_extended_types.cpp @@ -0,0 +1,138 @@ +// +// Copyright (c) 2021 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#include "procs.h" +#include "subhelpers.h" +#include "subgroup_common_kernels.h" +#include "subgroup_common_templates.h" +#include "harness/typeWrappers.h" + +namespace { + +template int run_broadcast_for_extended_type(RunTestForType rft) +{ + int error = rft.run_impl>( + "test_bcast", bcast_source); + return error; +} + +template int run_scan_reduction_for_type(RunTestForType rft) +{ + int error = rft.run_impl>("test_redadd", + redadd_source); + error |= rft.run_impl>("test_redmax", + redmax_source); + error |= rft.run_impl>("test_redmin", + redmin_source); + error |= rft.run_impl>("test_scinadd", + scinadd_source); + error |= rft.run_impl>("test_scinmax", + scinmax_source); + error |= rft.run_impl>("test_scinmin", + scinmin_source); + error |= rft.run_impl>("test_scexadd", + scexadd_source); + error |= rft.run_impl>("test_scexmax", + scexmax_source); + error |= rft.run_impl>("test_scexmin", + scexmin_source); + return error; +} + + +} + +int test_subgroup_functions_extended_types(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements) +{ + std::vector required_extensions = { + "cl_khr_subgroup_extended_types" + }; + constexpr size_t global_work_size = 2000; + constexpr size_t local_work_size = 200; + WorkGroupParams test_params(global_work_size, local_work_size, + required_extensions); + RunTestForType rft(device, context, queue, num_elements, test_params); + + int error = run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + error |= run_broadcast_for_extended_type(rft); + + error |= run_scan_reduction_for_type(rft); + error |= run_scan_reduction_for_type(rft); + error |= run_scan_reduction_for_type(rft); + error |= run_scan_reduction_for_type(rft); + return error; +} diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp new file mode 100644 index 00000000..eb46ff09 --- /dev/null +++ b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp @@ -0,0 +1,473 @@ +// +// Copyright (c) 2021 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#include "procs.h" +#include "subhelpers.h" +#include "harness/typeWrappers.h" +#include "subgroup_common_templates.h" + +namespace { + +static const char *scinadd_non_uniform_source = R"( + __kernel void test_scinadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_inclusive_add(in[gid]); + } + } +)"; + +static const char *scinmax_non_uniform_source = R"( + __kernel void test_scinmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_inclusive_max(in[gid]); + } + } +)"; + +static const char *scinmin_non_uniform_source = R"( + __kernel void test_scinmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_inclusive_min(in[gid]); + } + } +)"; + +static const char *scinmul_non_uniform_source = R"( + __kernel void test_scinmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_inclusive_mul(in[gid]); + } + } +)"; + +static const char *scinand_non_uniform_source = R"( + __kernel void test_scinand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_inclusive_and(in[gid]); + } + } +)"; + +static const char *scinor_non_uniform_source = R"( + __kernel void test_scinor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_inclusive_or(in[gid]); + } + } +)"; + +static const char *scinxor_non_uniform_source = R"( + __kernel void test_scinxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_inclusive_xor(in[gid]); + } + } +)"; + +static const char *scinand_non_uniform_logical_source = R"( + __kernel void test_scinand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_inclusive_logical_and(in[gid]); + } + } +)"; + +static const char *scinor_non_uniform_logical_source = R"( + __kernel void test_scinor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_inclusive_logical_or(in[gid]); + } + } +)"; + +static const char *scinxor_non_uniform_logical_source = R"( + __kernel void test_scinxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_inclusive_logical_xor(in[gid]); + } + } +)"; + +static const char *scexadd_non_uniform_source = R"( + __kernel void test_scexadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_exclusive_add(in[gid]); + } + } +)"; + +static const char *scexmax_non_uniform_source = R"( + __kernel void test_scexmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_exclusive_max(in[gid]); + } + } +)"; + +static const char *scexmin_non_uniform_source = R"( + __kernel void test_scexmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_exclusive_min(in[gid]); + } + } +)"; + +static const char *scexmul_non_uniform_source = R"( + __kernel void test_scexmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_exclusive_mul(in[gid]); + } + } +)"; + +static const char *scexand_non_uniform_source = R"( + __kernel void test_scexand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_exclusive_and(in[gid]); + } + } +)"; + +static const char *scexor_non_uniform_source = R"( + __kernel void test_scexor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_exclusive_or(in[gid]); + } + } +)"; + +static const char *scexxor_non_uniform_source = R"( + __kernel void test_scexxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_exclusive_xor(in[gid]); + } + } +)"; + +static const char *scexand_non_uniform_logical_source = R"( + __kernel void test_scexand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_exclusive_logical_and(in[gid]); + } + } +)"; + +static const char *scexor_non_uniform_logical_source = R"( + __kernel void test_scexor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_exclusive_logical_or(in[gid]); + } + } +)"; + +static const char *scexxor_non_uniform_logical_source = R"( + __kernel void test_scexxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_scan_exclusive_logical_xor(in[gid]); + } + } +)"; + +static const char *redadd_non_uniform_source = R"( + __kernel void test_redadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_reduce_add(in[gid]); + } + } +)"; + +static const char *redmax_non_uniform_source = R"( + __kernel void test_redmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_reduce_max(in[gid]); + } + } +)"; + +static const char *redmin_non_uniform_source = R"( + __kernel void test_redmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_reduce_min(in[gid]); + } + } +)"; + +static const char *redmul_non_uniform_source = R"( + __kernel void test_redmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_reduce_mul(in[gid]); + } + } +)"; + +static const char *redand_non_uniform_source = R"( + __kernel void test_redand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_reduce_and(in[gid]); + } + } +)"; + +static const char *redor_non_uniform_source = R"( + __kernel void test_redor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_reduce_or(in[gid]); + } + } +)"; + +static const char *redxor_non_uniform_source = R"( + __kernel void test_redxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_reduce_xor(in[gid]); + } + } +)"; + +static const char *redand_non_uniform_logical_source = R"( + __kernel void test_redand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_reduce_logical_and(in[gid]); + } + } +)"; + +static const char *redor_non_uniform_logical_source = R"( + __kernel void test_redor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_reduce_logical_or(in[gid]); + } + } +)"; + +static const char *redxor_non_uniform_logical_source = R"( + __kernel void test_redxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + int elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_reduce_logical_xor(in[gid]); + } + } +)"; + +template +int run_functions_add_mul_max_min_for_type(RunTestForType rft) +{ + int error = rft.run_impl>( + "test_scinadd_non_uniform", scinadd_non_uniform_source); + error |= rft.run_impl>( + "test_scinmul_non_uniform", scinmul_non_uniform_source); + error |= rft.run_impl>( + "test_scinmax_non_uniform", scinmax_non_uniform_source); + error |= rft.run_impl>( + "test_scinmin_non_uniform", scinmin_non_uniform_source); + error |= rft.run_impl>( + "test_scexadd_non_uniform", scexadd_non_uniform_source); + error |= rft.run_impl>( + "test_scexmul_non_uniform", scexmul_non_uniform_source); + error |= rft.run_impl>( + "test_scexmax_non_uniform", scexmax_non_uniform_source); + error |= rft.run_impl>( + "test_scexmin_non_uniform", scexmin_non_uniform_source); + error |= rft.run_impl>( + "test_redadd_non_uniform", redadd_non_uniform_source); + error |= rft.run_impl>( + "test_redmul_non_uniform", redmul_non_uniform_source); + error |= rft.run_impl>( + "test_redmax_non_uniform", redmax_non_uniform_source); + error |= rft.run_impl>( + "test_redmin_non_uniform", redmin_non_uniform_source); + return error; +} + +template int run_functions_and_or_xor_for_type(RunTestForType rft) +{ + int error = rft.run_impl>( + "test_scinand_non_uniform", scinand_non_uniform_source); + error |= rft.run_impl>( + "test_scinor_non_uniform", scinor_non_uniform_source); + error |= rft.run_impl>( + "test_scinxor_non_uniform", scinxor_non_uniform_source); + error |= rft.run_impl>( + "test_scexand_non_uniform", scexand_non_uniform_source); + error |= rft.run_impl>( + "test_scexor_non_uniform", scexor_non_uniform_source); + error |= rft.run_impl>( + "test_scexxor_non_uniform", scexxor_non_uniform_source); + error |= rft.run_impl>( + "test_redand_non_uniform", redand_non_uniform_source); + error |= rft.run_impl>( + "test_redor_non_uniform", redor_non_uniform_source); + error |= rft.run_impl>( + "test_redxor_non_uniform", redxor_non_uniform_source); + return error; +} + +template +int run_functions_logical_and_or_xor_for_type(RunTestForType rft) +{ + int error = rft.run_impl>( + "test_scinand_non_uniform_logical", scinand_non_uniform_logical_source); + error |= rft.run_impl>( + "test_scinor_non_uniform_logical", scinor_non_uniform_logical_source); + error |= rft.run_impl>( + "test_scinxor_non_uniform_logical", scinxor_non_uniform_logical_source); + error |= rft.run_impl>( + "test_scexand_non_uniform_logical", scexand_non_uniform_logical_source); + error |= rft.run_impl>( + "test_scexor_non_uniform_logical", scexor_non_uniform_logical_source); + error |= rft.run_impl>( + "test_scexxor_non_uniform_logical", scexxor_non_uniform_logical_source); + error |= rft.run_impl>( + "test_redand_non_uniform_logical", redand_non_uniform_logical_source); + error |= rft.run_impl>( + "test_redor_non_uniform_logical", redor_non_uniform_logical_source); + error |= rft.run_impl>( + "test_redxor_non_uniform_logical", redxor_non_uniform_logical_source); + return error; +} + +} + +int test_subgroup_functions_non_uniform_arithmetic(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements) +{ + std::vector required_extensions = { + "cl_khr_subgroup_non_uniform_arithmetic" + }; + std::vector masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555, + 0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00, + 0x00ffff00, 0x80000000, 0xaaaaaaaa }; + + constexpr size_t global_work_size = 2000; + constexpr size_t local_work_size = 200; + WorkGroupParams test_params(global_work_size, local_work_size, + required_extensions, masks); + RunTestForType rft(device, context, queue, num_elements, test_params); + + int error = run_functions_add_mul_max_min_for_type(rft); + error |= run_functions_add_mul_max_min_for_type(rft); + error |= run_functions_add_mul_max_min_for_type(rft); + error |= run_functions_add_mul_max_min_for_type(rft); + error |= run_functions_add_mul_max_min_for_type(rft); + error |= run_functions_add_mul_max_min_for_type(rft); + error |= run_functions_add_mul_max_min_for_type(rft); + error |= run_functions_add_mul_max_min_for_type(rft); + error |= run_functions_add_mul_max_min_for_type(rft); + error |= run_functions_add_mul_max_min_for_type(rft); + error |= run_functions_add_mul_max_min_for_type(rft); + + error |= run_functions_and_or_xor_for_type(rft); + error |= run_functions_and_or_xor_for_type(rft); + error |= run_functions_and_or_xor_for_type(rft); + error |= run_functions_and_or_xor_for_type(rft); + error |= run_functions_and_or_xor_for_type(rft); + error |= run_functions_and_or_xor_for_type(rft); + error |= run_functions_and_or_xor_for_type(rft); + error |= run_functions_and_or_xor_for_type(rft); + + error |= run_functions_logical_and_or_xor_for_type(rft); + return error; +} \ No newline at end of file diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp new file mode 100644 index 00000000..2b00b4dd --- /dev/null +++ b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp @@ -0,0 +1,303 @@ +// +// Copyright (c) 2021 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#include "procs.h" +#include "subhelpers.h" +#include "harness/typeWrappers.h" +#include + +namespace { + +template struct VOTE +{ + static void gen(T *x, T *t, cl_int *m, const WorkGroupParams &test_params) + { + int i, ii, j, k, n; + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int ng = test_params.global_workgroup_size; + uint32_t work_items_mask = test_params.work_items_mask; + int nj = (nw + ns - 1) / ns; + int non_uniform_size = ng % nw; + ng = ng / nw; + int last_subgroup_size = 0; + ii = 0; + + log_info(" sub_group_%s%s... \n", + (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_", + operation_names(operation)); + + log_info(" test params: global size = %d local size = %d subgroups " + "size = %d work item mask = 0x%x data type (%s)\n", + test_params.global_workgroup_size, nw, ns, work_items_mask, + TypeManager::name()); + if (non_uniform_size) + { + log_info(" non uniform work group size mode ON\n"); + } + if (operation == NonUniformVoteOp::elect) return; + + for (k = 0; k < ng; ++k) + { // for each work_group + if (non_uniform_size && k == ng - 1) + { + set_last_workgroup_params(non_uniform_size, nj, ns, nw, + last_subgroup_size); + } + for (j = 0; j < nj; ++j) + { // for each subgroup + ii = j * ns; + if (last_subgroup_size && j == nj - 1) + { + n = last_subgroup_size; + } + else + { + n = ii + ns > nw ? nw - ii : ns; + } + int e = genrand_int32(gMTdata) % 3; + + for (i = 0; i < n; i++) + { + if (e == 2) + { // set once 0 and once 1 alternately + int value = i % 2; + set_value(t[ii + i], value); + } + else + { // set 0/1 for all work items in subgroup + set_value(t[ii + i], e); + } + } + } + // Now map into work group using map from device + for (j = 0; j < nw; ++j) + { + x[j] = t[j]; + } + x += nw; + m += 4 * nw; + } + } + + static int chk(T *x, T *y, T *mx, T *my, cl_int *m, + const WorkGroupParams &test_params) + { + int ii, i, j, k, n; + int nw = test_params.local_workgroup_size; + int ns = test_params.subgroup_size; + int ng = test_params.global_workgroup_size; + uint32_t work_items_mask = test_params.work_items_mask; + int nj = (nw + ns - 1) / ns; + cl_int tr, rr; + int non_uniform_size = ng % nw; + ng = ng / nw; + if (non_uniform_size) ng++; + int last_subgroup_size = 0; + + for (k = 0; k < ng; ++k) + { // for each work_group + if (non_uniform_size && k == ng - 1) + { + set_last_workgroup_params(non_uniform_size, nj, ns, nw, + last_subgroup_size); + } + for (j = 0; j < nw; ++j) + { // inside the work_group + mx[j] = x[j]; // read host inputs for work_group + my[j] = y[j]; // read device outputs for work_group + } + + for (j = 0; j < nj; ++j) + { // for each subgroup + ii = j * ns; + if (last_subgroup_size && j == nj - 1) + { + n = last_subgroup_size; + } + else + { + n = ii + ns > nw ? nw - ii : ns; + } + + rr = 0; + if (operation == NonUniformVoteOp::all + || operation == NonUniformVoteOp::all_equal) + tr = 1; + if (operation == NonUniformVoteOp::any) tr = 0; + + std::set active_work_items; + for (i = 0; i < n; ++i) + { + uint32_t check_work_item = 1 << (i % 32); + if (work_items_mask & check_work_item) + { + active_work_items.insert(i); + switch (operation) + { + case NonUniformVoteOp::elect: break; + + case NonUniformVoteOp::all: + tr &= + !compare_ordered(mx[ii + i], 0) ? 1 : 0; + break; + case NonUniformVoteOp::any: + tr |= + !compare_ordered(mx[ii + i], 0) ? 1 : 0; + break; + case NonUniformVoteOp::all_equal: + tr &= compare_ordered( + mx[ii + i], + mx[ii + *active_work_items.begin()]) + ? 1 + : 0; + break; + default: + log_error("Unknown operation\n"); + return TEST_FAIL; + } + } + } + if (active_work_items.empty()) + { + log_info(" no one workitem acitve... in workgroup id = %d " + "subgroup id = %d\n", + k, j); + } + else + { + auto lowest_active = active_work_items.begin(); + for (const int &active_work_item : active_work_items) + { + i = active_work_item; + if (operation == NonUniformVoteOp::elect) + { + i == *lowest_active ? tr = 1 : tr = 0; + } + + // normalize device values on host, non zero set 1. + rr = compare_ordered(my[ii + i], 0) ? 0 : 1; + + if (rr != tr) + { + log_error("ERROR: sub_group_%s() \n", + operation_names(operation)); + log_error( + "mismatch for work item %d sub group %d in " + "work group %d. Expected: %d Obtained: %d\n", + i, j, k, tr, rr); + return TEST_FAIL; + } + } + } + } + + x += nw; + y += nw; + m += 4 * nw; + } + + log_info(" sub_group_%s%s... passed\n", + (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_", + operation_names(operation)); + return TEST_PASS; + } +}; +static const char *elect_source = R"( + __kernel void test_elect(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + uint elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_elect(); + } + } +)"; + +static const char *non_uniform_any_source = R"( + __kernel void test_non_uniform_any(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + uint elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_any(in[gid]); + } + } +)"; + +static const char *non_uniform_all_source = R"( + __kernel void test_non_uniform_all(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + uint elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_all(in[gid]); + } + } +)"; + +static const char *non_uniform_all_equal_source = R"( + __kernel void test_non_uniform_all_equal(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + uint elect_work_item = 1 << (get_sub_group_local_id() % 32); + if (elect_work_item & WORK_ITEMS_MASK){ + out[gid] = sub_group_non_uniform_all_equal(in[gid]); + } + } +)"; + +template int run_vote_all_equal_for_type(RunTestForType rft) +{ + int error = rft.run_impl>( + "test_non_uniform_all_equal", non_uniform_all_equal_source); + return error; +} +} + +int test_subgroup_functions_non_uniform_vote(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements) +{ + std::vector required_extensions = { + "cl_khr_subgroup_non_uniform_vote" + }; + + std::vector masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555, + 0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00, + 0x00ffff00, 0x80000000 }; + constexpr size_t global_work_size = 170; + constexpr size_t local_work_size = 64; + WorkGroupParams test_params(global_work_size, local_work_size, + required_extensions, masks); + RunTestForType rft(device, context, queue, num_elements, test_params); + + int error = run_vote_all_equal_for_type(rft); + error |= run_vote_all_equal_for_type(rft); + error |= run_vote_all_equal_for_type(rft); + error |= run_vote_all_equal_for_type(rft); + error |= run_vote_all_equal_for_type(rft); + error |= run_vote_all_equal_for_type(rft); + error |= run_vote_all_equal_for_type(rft); + + error |= rft.run_impl>( + "test_non_uniform_all", non_uniform_all_source); + error |= rft.run_impl>( + "test_elect", elect_source); + error |= rft.run_impl>( + "test_non_uniform_any", non_uniform_any_source); + return error; +} diff --git a/test_conformance/subgroups/test_subgroup_shuffle.cpp b/test_conformance/subgroups/test_subgroup_shuffle.cpp new file mode 100644 index 00000000..049f0982 --- /dev/null +++ b/test_conformance/subgroups/test_subgroup_shuffle.cpp @@ -0,0 +1,78 @@ +// +// Copyright (c) 2021 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#include "procs.h" +#include "subhelpers.h" +#include "subgroup_common_templates.h" +#include "harness/typeWrappers.h" +#include + +namespace { + +static const char* shuffle_xor_source = + "__kernel void test_sub_group_shuffle_xor(const __global Type *in, " + "__global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " Type x = in[gid];\n" + " out[gid] = sub_group_shuffle_xor(x, xy[gid].z);" + "}\n"; + +static const char* shuffle_source = + "__kernel void test_sub_group_shuffle(const __global Type *in, __global " + "int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " Type x = in[gid];\n" + " out[gid] = sub_group_shuffle(x, xy[gid].z);" + "}\n"; + +template int run_shuffle_for_type(RunTestForType rft) +{ + int error = rft.run_impl>( + "test_sub_group_shuffle", shuffle_source); + error |= rft.run_impl>( + "test_sub_group_shuffle_xor", shuffle_xor_source); + return error; +} + +} + +int test_subgroup_functions_shuffle(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements) +{ + std::vector required_extensions{ "cl_khr_subgroup_shuffle" }; + constexpr size_t global_work_size = 2000; + constexpr size_t local_work_size = 200; + WorkGroupParams test_params(global_work_size, local_work_size, + required_extensions); + RunTestForType rft(device, context, queue, num_elements, test_params); + + int error = run_shuffle_for_type(rft); + error |= run_shuffle_for_type(rft); + error |= run_shuffle_for_type(rft); + error |= run_shuffle_for_type(rft); + error |= run_shuffle_for_type(rft); + error |= run_shuffle_for_type(rft); + error |= run_shuffle_for_type(rft); + error |= run_shuffle_for_type(rft); + error |= run_shuffle_for_type(rft); + error |= run_shuffle_for_type(rft); + error |= run_shuffle_for_type(rft); + + return error; +} diff --git a/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp b/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp new file mode 100644 index 00000000..6000c970 --- /dev/null +++ b/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp @@ -0,0 +1,81 @@ +// +// Copyright (c) 2021 The Khronos Group Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#include "procs.h" +#include "subhelpers.h" +#include "subgroup_common_templates.h" +#include "harness/conversions.h" +#include "harness/typeWrappers.h" + +namespace { + +static const char* shuffle_down_source = + "__kernel void test_sub_group_shuffle_down(const __global Type *in, " + "__global int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " Type x = in[gid];\n" + " out[gid] = sub_group_shuffle_down(x, xy[gid].z);" + "}\n"; +static const char* shuffle_up_source = + "__kernel void test_sub_group_shuffle_up(const __global Type *in, __global " + "int4 *xy, __global Type *out)\n" + "{\n" + " int gid = get_global_id(0);\n" + " XY(xy,gid);\n" + " Type x = in[gid];\n" + " out[gid] = sub_group_shuffle_up(x, xy[gid].z);" + "}\n"; + +template int run_shuffle_relative_for_type(RunTestForType rft) +{ + int error = rft.run_impl>( + "test_sub_group_shuffle_up", shuffle_up_source); + error |= rft.run_impl>( + "test_sub_group_shuffle_down", shuffle_down_source); + return error; +} + +} + +int test_subgroup_functions_shuffle_relative(cl_device_id device, + cl_context context, + cl_command_queue queue, + int num_elements) +{ + std::vector required_extensions = { + "cl_khr_subgroup_shuffle_relative" + }; + constexpr size_t global_work_size = 2000; + constexpr size_t local_work_size = 200; + WorkGroupParams test_params(global_work_size, local_work_size, + required_extensions); + RunTestForType rft(device, context, queue, num_elements, test_params); + + int error = run_shuffle_relative_for_type(rft); + error |= run_shuffle_relative_for_type(rft); + error |= run_shuffle_relative_for_type(rft); + error |= run_shuffle_relative_for_type(rft); + error |= run_shuffle_relative_for_type(rft); + error |= run_shuffle_relative_for_type(rft); + error |= run_shuffle_relative_for_type(rft); + error |= run_shuffle_relative_for_type(rft); + error |= run_shuffle_relative_for_type(rft); + error |= run_shuffle_relative_for_type(rft); + error |= run_shuffle_relative_for_type(rft); + + return error; +} diff --git a/test_conformance/subgroups/test_workgroup.cpp b/test_conformance/subgroups/test_workgroup.cpp deleted file mode 100644 index 779d30f6..00000000 --- a/test_conformance/subgroups/test_workgroup.cpp +++ /dev/null @@ -1,727 +0,0 @@ -// -// Copyright (c) 2017 The Khronos Group Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -#include "procs.h" -#include "subhelpers.h" -#include "harness/conversions.h" -#include "harness/typeWrappers.h" - -static const char *any_source = "__kernel void test_any(const __global Type " - "*in, __global int2 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_any(in[gid]);\n" - "}\n"; - -static const char *all_source = "__kernel void test_all(const __global Type " - "*in, __global int2 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_all(in[gid]);\n" - "}\n"; - -static const char *bcast_source = - "__kernel void test_bcast(const __global Type *in, __global int2 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " size_t loid = (size_t)((int)x % 100);\n" - " out[gid] = sub_group_broadcast(x, loid);\n" - "}\n"; - -static const char *redadd_source = - "__kernel void test_redadd(const __global Type *in, __global int2 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_reduce_add(in[gid]);\n" - "}\n"; - -static const char *redmax_source = - "__kernel void test_redmax(const __global Type *in, __global int2 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_reduce_max(in[gid]);\n" - "}\n"; - -static const char *redmin_source = - "__kernel void test_redmin(const __global Type *in, __global int2 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_reduce_min(in[gid]);\n" - "}\n"; - -static const char *scinadd_source = - "__kernel void test_scinadd(const __global Type *in, __global int2 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_scan_inclusive_add(in[gid]);\n" - "}\n"; - -static const char *scinmax_source = - "__kernel void test_scinmax(const __global Type *in, __global int2 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_scan_inclusive_max(in[gid]);\n" - "}\n"; - -static const char *scinmin_source = - "__kernel void test_scinmin(const __global Type *in, __global int2 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_scan_inclusive_min(in[gid]);\n" - "}\n"; - -static const char *scexadd_source = - "__kernel void test_scexadd(const __global Type *in, __global int2 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_scan_exclusive_add(in[gid]);\n" - "}\n"; - -static const char *scexmax_source = - "__kernel void test_scexmax(const __global Type *in, __global int2 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_scan_exclusive_max(in[gid]);\n" - "}\n"; - -static const char *scexmin_source = - "__kernel void test_scexmin(const __global Type *in, __global int2 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_scan_exclusive_min(in[gid]);\n" - "}\n"; - - -// Any/All test functions -template struct AA -{ - static void gen(cl_int *x, cl_int *t, cl_int *m, int ns, int nw, int ng) - { - int i, ii, j, k, n; - int nj = (nw + ns - 1) / ns; - int e; - - ii = 0; - for (k = 0; k < ng; ++k) - { - for (j = 0; j < nj; ++j) - { - ii = j * ns; - n = ii + ns > nw ? nw - ii : ns; - e = (int)(genrand_int32(gMTdata) % 3); - - // Initialize data matrix indexed by local id and sub group id - switch (e) - { - case 0: memset(&t[ii], 0, n * sizeof(cl_int)); break; - case 1: - memset(&t[ii], 0, n * sizeof(cl_int)); - i = (int)(genrand_int32(gMTdata) % (cl_uint)n); - t[ii + i] = 41; - break; - case 2: memset(&t[ii], 0xff, n * sizeof(cl_int)); break; - } - } - - // Now map into work group using map from device - for (j = 0; j < nw; ++j) - { - i = m[2 * j + 1] * ns + m[2 * j]; - x[j] = t[i]; - } - - x += nw; - m += 2 * nw; - } - } - - static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m, - int ns, int nw, int ng) - { - int ii, i, j, k, n; - int nj = (nw + ns - 1) / ns; - cl_int taa, raa; - - log_info(" sub_group_%s...\n", Which == 0 ? "any" : "all"); - - for (k = 0; k < ng; ++k) - { - // Map to array indexed to array indexed by local ID and sub group - for (j = 0; j < nw; ++j) - { - i = m[2 * j + 1] * ns + m[2 * j]; - mx[i] = x[j]; - my[i] = y[j]; - } - - for (j = 0; j < nj; ++j) - { - ii = j * ns; - n = ii + ns > nw ? nw - ii : ns; - - // Compute target - if (Which == 0) - { - taa = 0; - for (i = 0; i < n; ++i) taa |= mx[ii + i] != 0; - } - else - { - taa = 1; - for (i = 0; i < n; ++i) taa &= mx[ii + i] != 0; - } - - // Check result - for (i = 0; i < n; ++i) - { - raa = my[ii + i] != 0; - if (raa != taa) - { - log_error("ERROR: sub_group_%s mismatch for local id " - "%d in sub group %d in group %d\n", - Which == 0 ? "any" : "all", i, j, k); - return -1; - } - } - } - - x += nw; - y += nw; - m += 2 * nw; - } - - return 0; - } -}; - -// Reduce functions -template struct RED -{ - static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng) - { - int i, ii, j, k, n; - int nj = (nw + ns - 1) / ns; - - ii = 0; - for (k = 0; k < ng; ++k) - { - for (j = 0; j < nj; ++j) - { - ii = j * ns; - n = ii + ns > nw ? nw - ii : ns; - - for (i = 0; i < n; ++i) - t[ii + i] = (Ty)( - (int)(genrand_int32(gMTdata) & 0x7fffffff) % ns + 1); - } - - // Now map into work group using map from device - for (j = 0; j < nw; ++j) - { - i = m[2 * j + 1] * ns + m[2 * j]; - x[j] = t[i]; - } - - x += nw; - m += 2 * nw; - } - } - - static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw, - int ng) - { - int ii, i, j, k, n; - int nj = (nw + ns - 1) / ns; - Ty tr, rr; - - log_info(" sub_group_reduce_%s(%s)...\n", - Which == 0 ? "add" : (Which == 1 ? "max" : "min"), - TypeName::val()); - - for (k = 0; k < ng; ++k) - { - // Map to array indexed to array indexed by local ID and sub group - for (j = 0; j < nw; ++j) - { - i = m[2 * j + 1] * ns + m[2 * j]; - mx[i] = x[j]; - my[i] = y[j]; - } - - for (j = 0; j < nj; ++j) - { - ii = j * ns; - n = ii + ns > nw ? nw - ii : ns; - - // Compute target - if (Which == 0) - { - // add - tr = mx[ii]; - for (i = 1; i < n; ++i) tr += mx[ii + i]; - } - else if (Which == 1) - { - // max - tr = mx[ii]; - for (i = 1; i < n; ++i) - tr = tr > mx[ii + i] ? tr : mx[ii + i]; - } - else if (Which == 2) - { - // min - tr = mx[ii]; - for (i = 1; i < n; ++i) - tr = tr > mx[ii + i] ? mx[ii + i] : tr; - } - - // Check result - for (i = 0; i < n; ++i) - { - rr = my[ii + i]; - if (rr != tr) - { - log_error("ERROR: sub_group_reduce_%s(%s) mismatch for " - "local id %d in sub group %d in group %d\n", - Which == 0 ? "add" - : (Which == 1 ? "max" : "min"), - TypeName::val(), i, j, k); - return -1; - } - } - } - - x += nw; - y += nw; - m += 2 * nw; - } - - return 0; - } -}; - -// Scan Inclusive functions -template struct SCIN -{ - static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng) - { - int i, ii, j, k, n; - int nj = (nw + ns - 1) / ns; - - ii = 0; - for (k = 0; k < ng; ++k) - { - for (j = 0; j < nj; ++j) - { - ii = j * ns; - n = ii + ns > nw ? nw - ii : ns; - - for (i = 0; i < n; ++i) - // t[ii+i] = (Ty)((int)(genrand_int32(gMTdata) & 0x7fffffff) - // % ns + 1); - t[ii + i] = (Ty)i; - } - - // Now map into work group using map from device - for (j = 0; j < nw; ++j) - { - i = m[2 * j + 1] * ns + m[2 * j]; - x[j] = t[i]; - } - - x += nw; - m += 2 * nw; - } - } - - static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw, - int ng) - { - int ii, i, j, k, n; - int nj = (nw + ns - 1) / ns; - Ty tr, rr; - - log_info(" sub_group_scan_inclusive_%s(%s)...\n", - Which == 0 ? "add" : (Which == 1 ? "max" : "min"), - TypeName::val()); - - for (k = 0; k < ng; ++k) - { - // Map to array indexed to array indexed by local ID and sub group - for (j = 0; j < nw; ++j) - { - i = m[2 * j + 1] * ns + m[2 * j]; - mx[i] = x[j]; - my[i] = y[j]; - } - - for (j = 0; j < nj; ++j) - { - ii = j * ns; - n = ii + ns > nw ? nw - ii : ns; - - // Check result - for (i = 0; i < n; ++i) - { - if (Which == 0) - { - tr = i == 0 ? mx[ii] : tr + mx[ii + i]; - } - else if (Which == 1) - { - tr = i == 0 ? mx[ii] - : (tr > mx[ii + i] ? tr : mx[ii + i]); - } - else - { - tr = i == 0 ? mx[ii] - : (tr > mx[ii + i] ? mx[ii + i] : tr); - } - - rr = my[ii + i]; - if (rr != tr) - { - log_error( - "ERROR: sub_group_scan_inclusive_%s(%s) mismatch " - "for local id %d in sub group %d in group %d\n", - Which == 0 ? "add" : (Which == 1 ? "max" : "min"), - TypeName::val(), i, j, k); - return -1; - } - } - } - - x += nw; - y += nw; - m += 2 * nw; - } - - return 0; - } -}; - -// Scan Exclusive functions -template struct SCEX -{ - static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng) - { - int i, ii, j, k, n; - int nj = (nw + ns - 1) / ns; - - ii = 0; - for (k = 0; k < ng; ++k) - { - for (j = 0; j < nj; ++j) - { - ii = j * ns; - n = ii + ns > nw ? nw - ii : ns; - - for (i = 0; i < n; ++i) - t[ii + i] = (Ty)( - (int)(genrand_int32(gMTdata) & 0x7fffffff) % ns + 1); - } - - // Now map into work group using map from device - for (j = 0; j < nw; ++j) - { - i = m[2 * j + 1] * ns + m[2 * j]; - x[j] = t[i]; - } - - x += nw; - m += 2 * nw; - } - } - - static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw, - int ng) - { - int ii, i, j, k, n; - int nj = (nw + ns - 1) / ns; - Ty tr, trt, rr; - - log_info(" sub_group_scan_exclusive_%s(%s)...\n", - Which == 0 ? "add" : (Which == 1 ? "max" : "min"), - TypeName::val()); - - for (k = 0; k < ng; ++k) - { - // Map to array indexed to array indexed by local ID and sub group - for (j = 0; j < nw; ++j) - { - i = m[2 * j + 1] * ns + m[2 * j]; - mx[i] = x[j]; - my[i] = y[j]; - } - - for (j = 0; j < nj; ++j) - { - ii = j * ns; - n = ii + ns > nw ? nw - ii : ns; - - // Check result - for (i = 0; i < n; ++i) - { - if (Which == 0) - { - tr = i == 0 ? TypeIdentity::val() : tr + trt; - } - else if (Which == 1) - { - tr = i == 0 ? TypeIdentity::val() - : (trt > tr ? trt : tr); - } - else - { - tr = i == 0 ? TypeIdentity::val() - : (trt > tr ? tr : trt); - } - trt = mx[ii + i]; - rr = my[ii + i]; - - if (rr != tr) - { - log_error( - "ERROR: sub_group_scan_exclusive_%s(%s) mismatch " - "for local id %d in sub group %d in group %d\n", - Which == 0 ? "add" : (Which == 1 ? "max" : "min"), - TypeName::val(), i, j, k); - return -1; - } - } - } - - x += nw; - y += nw; - m += 2 * nw; - } - - return 0; - } -}; - -// Broadcast functios -template struct BC -{ - static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng) - { - int i, ii, j, k, l, n; - int nj = (nw + ns - 1) / ns; - int d = ns > 100 ? 100 : ns; - - ii = 0; - for (k = 0; k < ng; ++k) - { - for (j = 0; j < nj; ++j) - { - ii = j * ns; - n = ii + ns > nw ? nw - ii : ns; - l = (int)(genrand_int32(gMTdata) & 0x7fffffff) - % (d > n ? n : d); - - for (i = 0; i < n; ++i) - t[ii + i] = (Ty)((int)(genrand_int32(gMTdata) & 0x7fffffff) - % 100 * 100 - + l); - } - - // Now map into work group using map from device - for (j = 0; j < nw; ++j) - { - i = m[2 * j + 1] * ns + m[2 * j]; - x[j] = t[i]; - } - - x += nw; - m += 2 * nw; - } - } - - static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw, - int ng) - { - int ii, i, j, k, l, n; - int nj = (nw + ns - 1) / ns; - Ty tr, rr; - - log_info(" sub_group_broadcast(%s)...\n", TypeName::val()); - - for (k = 0; k < ng; ++k) - { - // Map to array indexed to array indexed by local ID and sub group - for (j = 0; j < nw; ++j) - { - i = m[2 * j + 1] * ns + m[2 * j]; - mx[i] = x[j]; - my[i] = y[j]; - } - - for (j = 0; j < nj; ++j) - { - ii = j * ns; - n = ii + ns > nw ? nw - ii : ns; - l = (int)mx[ii] % 100; - tr = mx[ii + l]; - - // Check result - for (i = 0; i < n; ++i) - { - rr = my[ii + i]; - if (rr != tr) - { - log_error("ERROR: sub_group_broadcast(%s) mismatch for " - "local id %d in sub group %d in group %d\n", - TypeName::val(), i, j, k); - return -1; - } - } - } - - x += nw; - y += nw; - m += 2 * nw; - } - - return 0; - } -}; - -#define G 2000 -#define L 200 -struct run_for_type -{ - run_for_type(cl_device_id device, cl_context context, - cl_command_queue queue, int num_elements, - bool useCoreSubgroups) - { - device_ = device; - context_ = context; - queue_ = queue; - num_elements_ = num_elements; - useCoreSubgroups_ = useCoreSubgroups; - } - - template cl_int run() - { - cl_int error; - error = test, G, L>::run(device_, context_, queue_, - num_elements_, "test_bcast", - bcast_source, 0, useCoreSubgroups_); - error |= test, G, L>::run( - device_, context_, queue_, num_elements_, "test_redadd", - redadd_source, 0, useCoreSubgroups_); - error |= test, G, L>::run( - device_, context_, queue_, num_elements_, "test_redmax", - redmax_source, 0, useCoreSubgroups_); - error |= test, G, L>::run( - device_, context_, queue_, num_elements_, "test_redmin", - redmin_source, 0, useCoreSubgroups_); - error |= test, G, L>::run( - device_, context_, queue_, num_elements_, "test_scinadd", - scinadd_source, 0, useCoreSubgroups_); - error |= test, G, L>::run( - device_, context_, queue_, num_elements_, "test_scinmax", - scinmax_source, 0, useCoreSubgroups_); - error |= test, G, L>::run( - device_, context_, queue_, num_elements_, "test_scinmin", - scinmin_source, 0, useCoreSubgroups_); - error |= test, G, L>::run( - device_, context_, queue_, num_elements_, "test_scexadd", - scexadd_source, 0, useCoreSubgroups_); - error |= test, G, L>::run( - device_, context_, queue_, num_elements_, "test_scexmax", - scexmax_source, 0, useCoreSubgroups_); - error |= test, G, L>::run( - device_, context_, queue_, num_elements_, "test_scexmin", - scexmin_source, 0, useCoreSubgroups_); - return error; - } - -private: - cl_device_id device_; - cl_context context_; - cl_command_queue queue_; - int num_elements_; - bool useCoreSubgroups_; -}; - -// Entry point from main -int test_work_group_functions(cl_device_id device, cl_context context, - cl_command_queue queue, int num_elements, - bool useCoreSubgroups) -{ - int error; - error = test, G, L>::run(device, context, queue, num_elements, - "test_any", any_source, 0, - useCoreSubgroups); - error |= test, G, L>::run(device, context, queue, num_elements, - "test_all", all_source, 0, - useCoreSubgroups); - run_for_type rft(device, context, queue, num_elements, useCoreSubgroups); - error |= rft.run(); - error |= rft.run(); - error |= rft.run(); - error |= rft.run(); - error |= rft.run(); - error |= rft.run(); - // error |= rft.run(); - - return error; -} - -int test_work_group_functions_core(cl_device_id device, cl_context context, - cl_command_queue queue, int num_elements) -{ - return test_work_group_functions(device, context, queue, num_elements, - true); -} - -int test_work_group_functions_ext(cl_device_id device, cl_context context, - cl_command_queue queue, int num_elements) -{ - bool hasExtension = is_extension_available(device, "cl_khr_subgroups"); - - if (!hasExtension) - { - log_info( - "Device does not support 'cl_khr_subgroups'. Skipping the test.\n"); - return TEST_SKIPPED_ITSELF; - } - return test_work_group_functions(device, context, queue, num_elements, - false); -}