From 92844bead1afdf75b56085c2cda34be27458a582 Mon Sep 17 00:00:00 2001 From: Grzegorz Wawiorko Date: Fri, 1 Oct 2021 12:28:37 +0200 Subject: [PATCH] Extended subgroups - use 128bit masks (#1215) * Extended subgroups - use 128bit masks * Refactoring to avoid kernels code duplication * unification kernel names as test_ prefix +subgroups function name * use string literals that improve readability * use kernel templates that limit code duplication * WorkGroupParams allows define default kernel - kernel template for multiple functions * WorkGroupParams allows define kernel for specific one subgroup function Co-authored-by: Stuart Brady --- .../subgroups/subgroup_common_kernels.cpp | 102 +---- .../subgroups/subgroup_common_kernels.h | 12 +- .../subgroups/subgroup_common_templates.h | 98 ++-- test_conformance/subgroups/subhelpers.h | 181 +++++++- test_conformance/subgroups/test_subgroup.cpp | 47 +- .../subgroups/test_subgroup_ballot.cpp | 425 +++++++----------- .../test_subgroup_clustered_reduce.cpp | 176 +------- .../test_subgroup_extended_types.cpp | 44 +- .../test_subgroup_non_uniform_arithmetic.cpp | 409 +++-------------- .../test_subgroup_non_uniform_vote.cpp | 95 ++-- .../subgroups/test_subgroup_shuffle.cpp | 29 +- .../test_subgroup_shuffle_relative.cpp | 28 +- 12 files changed, 592 insertions(+), 1054 deletions(-) diff --git a/test_conformance/subgroups/subgroup_common_kernels.cpp b/test_conformance/subgroups/subgroup_common_kernels.cpp index f8b24450..33a51637 100644 --- a/test_conformance/subgroups/subgroup_common_kernels.cpp +++ b/test_conformance/subgroups/subgroup_common_kernels.cpp @@ -15,92 +15,20 @@ // #include "subgroup_common_kernels.h" -const char* bcast_source = - "__kernel void test_bcast(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " uint which_sub_group_local_id = xy[gid].z;\n" - " out[gid] = sub_group_broadcast(x, which_sub_group_local_id);\n" - "}\n"; +std::string sub_group_reduction_scan_source = R"( + __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + out[gid] = %s(in[gid]); + } +)"; -const char* redadd_source = "__kernel void test_redadd(const __global Type " - "*in, __global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_reduce_add(in[gid]);\n" - "}\n"; - -const char* redmax_source = "__kernel void test_redmax(const __global Type " - "*in, __global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_reduce_max(in[gid]);\n" - "}\n"; - -const char* redmin_source = "__kernel void test_redmin(const __global Type " - "*in, __global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_reduce_min(in[gid]);\n" - "}\n"; - -const char* scinadd_source = - "__kernel void test_scinadd(const __global Type *in, __global int4 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_scan_inclusive_add(in[gid]);\n" - "}\n"; - -const char* scinmax_source = - "__kernel void test_scinmax(const __global Type *in, __global int4 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_scan_inclusive_max(in[gid]);\n" - "}\n"; - -const char* scinmin_source = - "__kernel void test_scinmin(const __global Type *in, __global int4 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_scan_inclusive_min(in[gid]);\n" - "}\n"; - -const char* scexadd_source = - "__kernel void test_scexadd(const __global Type *in, __global int4 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_scan_exclusive_add(in[gid]);\n" - "}\n"; - -const char* scexmax_source = - "__kernel void test_scexmax(const __global Type *in, __global int4 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_scan_exclusive_max(in[gid]);\n" - "}\n"; - -const char* scexmin_source = - "__kernel void test_scexmin(const __global Type *in, __global int4 *xy, " - "__global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " out[gid] = sub_group_scan_exclusive_min(in[gid]);\n" - "}\n"; +std::string sub_group_generic_source = R"( + __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + Type x = in[gid]; + out[gid] = %s(x, xy[gid].z); + } +)"; \ No newline at end of file diff --git a/test_conformance/subgroups/subgroup_common_kernels.h b/test_conformance/subgroups/subgroup_common_kernels.h index 8ae97d9a..bf2210ef 100644 --- a/test_conformance/subgroups/subgroup_common_kernels.h +++ b/test_conformance/subgroups/subgroup_common_kernels.h @@ -18,15 +18,7 @@ #include "subhelpers.h" -extern const char* bcast_source; -extern const char* redadd_source; -extern const char* redmax_source; -extern const char* redmin_source; -extern const char* scinadd_source; -extern const char* scinmax_source; -extern const char* scinmin_source; -extern const char* scexadd_source; -extern const char* scexmax_source; -extern const char* scexmin_source; +extern std::string sub_group_reduction_scan_source; +extern std::string sub_group_generic_source; #endif diff --git a/test_conformance/subgroups/subgroup_common_templates.h b/test_conformance/subgroups/subgroup_common_templates.h index 4333e95b..5c5f9560 100644 --- a/test_conformance/subgroups/subgroup_common_templates.h +++ b/test_conformance/subgroups/subgroup_common_templates.h @@ -17,13 +17,10 @@ #define SUBGROUPCOMMONTEMPLATES_H #include "typeWrappers.h" -#include #include "CL/cl_half.h" #include "subhelpers.h" - #include -typedef std::bitset<128> bs128; static cl_uint4 generate_bit_mask(cl_uint subgroup_local_id, const std::string &mask_type, cl_uint max_sub_group_size) @@ -577,16 +574,21 @@ template struct SCEX_NU int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; - uint32_t work_items_mask = test_params.work_items_mask; ng = ng / nw; std::string func_name; - work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive" - : func_name = "sub_group_scan_exclusive"; + test_params.work_items_mask.any() + ? func_name = "sub_group_non_uniform_scan_exclusive" + : func_name = "sub_group_scan_exclusive"; log_info(" %s_%s(%s)...\n", func_name.c_str(), operation_names(operation), TypeManager::name()); log_info(" test params: global size = %d local size = %d subgroups " - "size = %d work item mask = 0x%x \n", - test_params.global_workgroup_size, nw, ns, work_items_mask); + "size = %d \n", + test_params.global_workgroup_size, nw, ns); + if (test_params.work_items_mask.any()) + { + log_info(" work items mask: %s\n", + test_params.work_items_mask.to_string().c_str()); + } genrand(x, t, m, ns, nw, ng); } @@ -597,18 +599,22 @@ template struct SCEX_NU int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; - uint32_t work_items_mask = test_params.work_items_mask; + bs128 work_items_mask = test_params.work_items_mask; int nj = (nw + ns - 1) / ns; Ty tr, rr; ng = ng / nw; std::string func_name; - work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive" - : func_name = "sub_group_scan_exclusive"; + test_params.work_items_mask.any() + ? func_name = "sub_group_non_uniform_scan_exclusive" + : func_name = "sub_group_scan_exclusive"; + - uint32_t use_work_items_mask; // for uniform case take into consideration all workitems - use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask; + if (!work_items_mask.any()) + { + work_items_mask.set(); + } for (k = 0; k < ng; ++k) { // for each work_group // Map to array indexed to array indexed by local ID and sub group @@ -624,8 +630,7 @@ template struct SCEX_NU std::set active_work_items; for (i = 0; i < n; ++i) { - uint32_t check_work_item = 1 << (i % 32); - if (use_work_items_mask & check_work_item) + if (work_items_mask.test(i)) { active_work_items.insert(i); } @@ -688,18 +693,23 @@ template struct SCIN_NU int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; - uint32_t work_items_mask = test_params.work_items_mask; ng = ng / nw; std::string func_name; - work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive" - : func_name = "sub_group_scan_inclusive"; + test_params.work_items_mask.any() + ? func_name = "sub_group_non_uniform_scan_inclusive" + : func_name = "sub_group_scan_inclusive"; genrand(x, t, m, ns, nw, ng); log_info(" %s_%s(%s)...\n", func_name.c_str(), operation_names(operation), TypeManager::name()); log_info(" test params: global size = %d local size = %d subgroups " - "size = %d work item mask = 0x%x \n", - test_params.global_workgroup_size, nw, ns, work_items_mask); + "size = %d \n", + test_params.global_workgroup_size, nw, ns); + if (test_params.work_items_mask.any()) + { + log_info(" work items mask: %s\n", + test_params.work_items_mask.to_string().c_str()); + } } static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, @@ -709,18 +719,22 @@ template struct SCIN_NU int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; - uint32_t work_items_mask = test_params.work_items_mask; + bs128 work_items_mask = test_params.work_items_mask; + int nj = (nw + ns - 1) / ns; Ty tr, rr; ng = ng / nw; std::string func_name; - work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive" - : func_name = "sub_group_scan_inclusive"; + work_items_mask.any() + ? func_name = "sub_group_non_uniform_scan_inclusive" + : func_name = "sub_group_scan_inclusive"; - uint32_t use_work_items_mask; // for uniform case take into consideration all workitems - use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask; + if (!work_items_mask.any()) + { + work_items_mask.set(); + } // std::bitset<32> mask32(use_work_items_mask); // for (int k) mask32.count(); for (k = 0; k < ng; ++k) @@ -740,8 +754,7 @@ template struct SCIN_NU for (i = 0; i < n; ++i) { - uint32_t check_work_item = 1 << (i % 32); - if (use_work_items_mask & check_work_item) + if (work_items_mask.test(i)) { if (catch_frist_active == -1) { @@ -807,17 +820,22 @@ template struct RED_NU int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; - uint32_t work_items_mask = test_params.work_items_mask; ng = ng / nw; std::string func_name; - work_items_mask ? func_name = "sub_group_non_uniform_reduce" - : func_name = "sub_group_reduce"; + test_params.work_items_mask.any() + ? func_name = "sub_group_non_uniform_reduce" + : func_name = "sub_group_reduce"; log_info(" %s_%s(%s)...\n", func_name.c_str(), operation_names(operation), TypeManager::name()); log_info(" test params: global size = %d local size = %d subgroups " - "size = %d work item mask = 0x%x \n", - test_params.global_workgroup_size, nw, ns, work_items_mask); + "size = %d \n", + test_params.global_workgroup_size, nw, ns); + if (test_params.work_items_mask.any()) + { + log_info(" work items mask: %s\n", + test_params.work_items_mask.to_string().c_str()); + } genrand(x, t, m, ns, nw, ng); } @@ -828,14 +846,14 @@ template struct RED_NU int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; - uint32_t work_items_mask = test_params.work_items_mask; + bs128 work_items_mask = test_params.work_items_mask; int nj = (nw + ns - 1) / ns; ng = ng / nw; Ty tr, rr; std::string func_name; - work_items_mask ? func_name = "sub_group_non_uniform_reduce" - : func_name = "sub_group_reduce"; + work_items_mask.any() ? func_name = "sub_group_non_uniform_reduce" + : func_name = "sub_group_reduce"; for (k = 0; k < ng; ++k) { @@ -847,9 +865,10 @@ template struct RED_NU my[j] = y[j]; } - uint32_t use_work_items_mask; - use_work_items_mask = - !work_items_mask ? 0xFFFFFFFF : work_items_mask; + if (!work_items_mask.any()) + { + work_items_mask.set(); + } for (j = 0; j < nj; ++j) { @@ -859,8 +878,7 @@ template struct RED_NU int catch_frist_active = -1; for (i = 0; i < n; ++i) { - uint32_t check_work_item = 1 << (i % 32); - if (use_work_items_mask & check_work_item) + if (work_items_mask.test(i)) { if (catch_frist_active == -1) { diff --git a/test_conformance/subgroups/subhelpers.h b/test_conformance/subgroups/subhelpers.h index 0d497fb3..6d32928a 100644 --- a/test_conformance/subgroups/subhelpers.h +++ b/test_conformance/subgroups/subhelpers.h @@ -24,31 +24,172 @@ #include #include #include +#include +#include +#include #define NR_OF_ACTIVE_WORK_ITEMS 4 extern MTdata gMTdata; +typedef std::bitset<128> bs128; extern cl_half_rounding_mode g_rounding_mode; struct WorkGroupParams { WorkGroupParams(size_t gws, size_t lws, - const std::vector &all_wim = {}) + bool use_mask = false) : global_workgroup_size(gws), local_workgroup_size(lws), - all_work_item_masks(all_wim) + use_masks(use_mask) { subgroup_size = 0; work_items_mask = 0; use_core_subgroups = true; dynsc = 0; + load_masks(); } size_t global_workgroup_size; size_t local_workgroup_size; size_t subgroup_size; - uint32_t work_items_mask; + bs128 work_items_mask; int dynsc; bool use_core_subgroups; - std::vector all_work_item_masks; + std::vector all_work_item_masks; + bool use_masks; + void save_kernel_source(const std::string &source, std::string name = "") + { + if (name == "") + { + name = "default"; + } + if (kernel_function_name.find(name) != kernel_function_name.end()) + { + log_info("Kernel definition duplication. Source will be " + "overwritten for function name %s", + name.c_str()); + } + kernel_function_name[name] = source; + }; + // return specific defined kernel or default. + std::string get_kernel_source(std::string name) + { + if (kernel_function_name.find(name) == kernel_function_name.end()) + { + return kernel_function_name["default"]; + } + return kernel_function_name[name]; + } + + +private: + std::map kernel_function_name; + void load_masks() + { + if (use_masks) + { + // 1 in string will be set 1, 0 will be set 0 + bs128 mask_0xf0f0f0f0("11110000111100001111000011110000" + "11110000111100001111000011110000" + "11110000111100001111000011110000" + "11110000111100001111000011110000", + 128, '0', '1'); + all_work_item_masks.push_back(mask_0xf0f0f0f0); + // 1 in string will be set 0, 0 will be set 1 + bs128 mask_0x0f0f0f0f("11110000111100001111000011110000" + "11110000111100001111000011110000" + "11110000111100001111000011110000" + "11110000111100001111000011110000", + 128, '1', '0'); + all_work_item_masks.push_back(mask_0x0f0f0f0f); + bs128 mask_0x5555aaaa("10101010101010101010101010101010" + "10101010101010101010101010101010" + "10101010101010101010101010101010" + "10101010101010101010101010101010", + 128, '0', '1'); + all_work_item_masks.push_back(mask_0x5555aaaa); + bs128 mask_0xaaaa5555("10101010101010101010101010101010" + "10101010101010101010101010101010" + "10101010101010101010101010101010" + "10101010101010101010101010101010", + 128, '1', '0'); + all_work_item_masks.push_back(mask_0xaaaa5555); + // 0x0f0ff0f0 + bs128 mask_0x0f0ff0f0("00001111000011111111000011110000" + "00001111000011111111000011110000" + "00001111000011111111000011110000" + "00001111000011111111000011110000", + 128, '0', '1'); + all_work_item_masks.push_back(mask_0x0f0ff0f0); + // 0xff0000ff + bs128 mask_0xff0000ff("11111111000000000000000011111111" + "11111111000000000000000011111111" + "11111111000000000000000011111111" + "11111111000000000000000011111111", + 128, '0', '1'); + all_work_item_masks.push_back(mask_0xff0000ff); + // 0xff00ff00 + bs128 mask_0xff00ff00("11111111000000001111111100000000" + "11111111000000001111111100000000" + "11111111000000001111111100000000" + "11111111000000001111111100000000", + 128, '0', '1'); + all_work_item_masks.push_back(mask_0xff00ff00); + // 0x00ffff00 + bs128 mask_0x00ffff00("00000000111111111111111100000000" + "00000000111111111111111100000000" + "00000000111111111111111100000000" + "00000000111111111111111100000000", + 128, '0', '1'); + all_work_item_masks.push_back(mask_0x00ffff00); + // 0x80 1 workitem highest id for 8 subgroup size + bs128 mask_0x80808080("10000000100000001000000010000000" + "10000000100000001000000010000000" + "10000000100000001000000010000000" + "10000000100000001000000010000000", + 128, '0', '1'); + + all_work_item_masks.push_back(mask_0x80808080); + // 0x8000 1 workitem highest id for 16 subgroup size + bs128 mask_0x80008000("10000000000000001000000000000000" + "10000000000000001000000000000000" + "10000000000000001000000000000000" + "10000000000000001000000000000000", + 128, '0', '1'); + all_work_item_masks.push_back(mask_0x80008000); + // 0x80000000 1 workitem highest id for 32 subgroup size + bs128 mask_0x80000000("10000000000000000000000000000000" + "10000000000000000000000000000000" + "10000000000000000000000000000000" + "10000000000000000000000000000000", + 128, '0', '1'); + all_work_item_masks.push_back(mask_0x80000000); + // 0x80000000 00000000 1 workitem highest id for 64 subgroup size + // 0x80000000 1 workitem highest id for 32 subgroup size + bs128 mask_0x8000000000000000("10000000000000000000000000000000" + "00000000000000000000000000000000" + "10000000000000000000000000000000" + "00000000000000000000000000000000", + 128, '0', '1'); + + all_work_item_masks.push_back(mask_0x8000000000000000); + // 0x80000000 00000000 00000000 00000000 1 workitem highest id for + // 128 subgroup size + bs128 mask_0x80000000000000000000000000000000( + "10000000000000000000000000000000" + "00000000000000000000000000000000" + "00000000000000000000000000000000" + "00000000000000000000000000000000", + 128, '0', '1'); + all_work_item_masks.push_back( + mask_0x80000000000000000000000000000000); + + bs128 mask_0xffffffff("11111111111111111111111111111111" + "11111111111111111111111111111111" + "11111111111111111111111111111111" + "11111111111111111111111111111111", + 128, '0', '1'); + all_work_item_masks.push_back(mask_0xffffffff); + } + } }; enum class SubgroupsBroadcastOp @@ -1267,11 +1408,23 @@ template struct test std::vector mapout; mapout.resize(local); std::stringstream kernel_sstr; - if (test_params.work_items_mask != 0) + if (test_params.use_masks) { - kernel_sstr << "#define WORK_ITEMS_MASK "; - kernel_sstr << "0x" << std::hex << test_params.work_items_mask - << "\n"; + // Prapare uint4 type to store bitmask on kernel OpenCL C side + // To keep order the first characet in string is the lowest bit + // there was a need to give such offset to bitset constructor + // (first highest offset = 96) + std::bitset<32> bits_1_32(test_params.work_items_mask.to_string(), + 96, 32); + std::bitset<32> bits_33_64(test_params.work_items_mask.to_string(), + 64, 32); + std::bitset<32> bits_65_96(test_params.work_items_mask.to_string(), + 32, 32); + std::bitset<32> bits_97_128(test_params.work_items_mask.to_string(), + 0, 32); + kernel_sstr << "global uint4 work_item_mask_vector = (uint4)(0b" + << bits_1_32 << ",0b" << bits_33_64 << ",0b" + << bits_65_96 << ",0b" << bits_97_128 << ");\n"; } @@ -1452,18 +1605,24 @@ struct RunTestForType num_elements_(num_elements), test_params_(test_params) {} template - int run_impl(const char *kernel_name, const char *source) + int run_impl(const std::string &function_name) { int error = TEST_PASS; + std::string source = + std::regex_replace(test_params_.get_kernel_source(function_name), + std::regex("\\%s"), function_name); + std::string kernel_name = "test_" + function_name; if (test_params_.all_work_item_masks.size() > 0) { error = test::mrun(device_, context_, queue_, num_elements_, - kernel_name, source, test_params_); + kernel_name.c_str(), source.c_str(), + test_params_); } else { error = test::run(device_, context_, queue_, num_elements_, - kernel_name, source, test_params_); + kernel_name.c_str(), source.c_str(), + test_params_); } return error; diff --git a/test_conformance/subgroups/test_subgroup.cpp b/test_conformance/subgroups/test_subgroup.cpp index c0e49524..63bfc453 100644 --- a/test_conformance/subgroups/test_subgroup.cpp +++ b/test_conformance/subgroups/test_subgroup.cpp @@ -150,25 +150,25 @@ template int run_broadcast_scan_reduction_for_type(RunTestForType rft) { int error = rft.run_impl>( - "test_bcast", bcast_source); - error |= rft.run_impl>("test_redadd", - redadd_source); - error |= rft.run_impl>("test_redmax", - redmax_source); - error |= rft.run_impl>("test_redmin", - redmin_source); - error |= rft.run_impl>("test_scinadd", - scinadd_source); - error |= rft.run_impl>("test_scinmax", - scinmax_source); - error |= rft.run_impl>("test_scinmin", - scinmin_source); - error |= rft.run_impl>("test_scexadd", - scexadd_source); - error |= rft.run_impl>("test_scexmax", - scexmax_source); - error |= rft.run_impl>("test_scexmin", - scexmin_source); + "sub_group_broadcast"); + error |= + rft.run_impl>("sub_group_reduce_add"); + error |= + rft.run_impl>("sub_group_reduce_max"); + error |= + rft.run_impl>("sub_group_reduce_min"); + error |= rft.run_impl>( + "sub_group_scan_inclusive_add"); + error |= rft.run_impl>( + "sub_group_scan_inclusive_max"); + error |= rft.run_impl>( + "sub_group_scan_inclusive_min"); + error |= rft.run_impl>( + "sub_group_scan_exclusive_add"); + error |= rft.run_impl>( + "sub_group_scan_exclusive_max"); + error |= rft.run_impl>( + "sub_group_scan_exclusive_min"); return error; } @@ -181,11 +181,14 @@ int test_subgroup_functions(cl_device_id device, cl_context context, constexpr size_t global_work_size = 2000; constexpr size_t local_work_size = 200; WorkGroupParams test_params(global_work_size, local_work_size); + test_params.save_kernel_source(sub_group_reduction_scan_source); + test_params.save_kernel_source(sub_group_generic_source, + "sub_group_broadcast"); + RunTestForType rft(device, context, queue, num_elements, test_params); int error = - rft.run_impl>("test_any", any_source); - error |= - rft.run_impl>("test_all", all_source); + rft.run_impl>("sub_group_any"); + error |= rft.run_impl>("sub_group_all"); error |= run_broadcast_scan_reduction_for_type(rft); error |= run_broadcast_scan_reduction_for_type(rft); error |= run_broadcast_scan_reduction_for_type(rft); diff --git a/test_conformance/subgroups/test_subgroup_ballot.cpp b/test_conformance/subgroups/test_subgroup_ballot.cpp index 9a2da5d9..2bd54e43 100644 --- a/test_conformance/subgroups/test_subgroup_ballot.cpp +++ b/test_conformance/subgroups/test_subgroup_ballot.cpp @@ -684,239 +684,127 @@ template struct SMASK } }; -static const char *bcast_non_uniform_source = - "__kernel void test_bcast_non_uniform(const __global Type *in, __global " - "int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n" - " out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].z);\n" - " } else {\n" - " out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].w);\n" - " }\n" - "}\n"; +std::string sub_group_non_uniform_broadcast_source = R"( +__kernel void test_sub_group_non_uniform_broadcast(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + Type x = in[gid]; + if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) { + out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].z); + } else { + out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].w); + } +} +)"; +std::string sub_group_broadcast_first_source = R"( +__kernel void test_sub_group_broadcast_first(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + Type x = in[gid]; + if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) { + out[gid] = sub_group_broadcast_first(x);; + } else { + out[gid] = sub_group_broadcast_first(x);; + } +} +)"; +std::string sub_group_ballot_bit_scan_find_source = R"( +__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + Type x = in[gid]; + uint4 value = (uint4)(0,0,0,0); + value = (uint4)(%s(x),0,0,0); + out[gid] = value; +} +)"; +std::string sub_group_ballot_mask_source = R"( +__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + xy[gid].z = get_max_sub_group_size(); + Type x = in[gid]; + uint4 mask = %s(); + out[gid] = mask; +} +)"; +std::string sub_group_ballot_source = R"( +__kernel void test_sub_group_ballot(const __global Type *in, __global int4 *xy, __global Type *out) { + uint4 full_ballot = sub_group_ballot(1); + uint divergence_mask; + uint4 partial_ballot; + uint gid = get_global_id(0); + XY(xy,gid); + if (get_sub_group_local_id() & 1) { + divergence_mask = 0xaaaaaaaa; + partial_ballot = sub_group_ballot(1); + } else { + divergence_mask = 0x55555555; + partial_ballot = sub_group_ballot(1); + } + size_t lws = get_local_size(0); + uint4 masked_ballot = full_ballot; + masked_ballot.x &= divergence_mask; + masked_ballot.y &= divergence_mask; + masked_ballot.z &= divergence_mask; + masked_ballot.w &= divergence_mask; + out[gid] = all(masked_ballot == partial_ballot); -static const char *bcast_first_source = - "__kernel void test_bcast_first(const __global Type *in, __global int4 " - "*xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n" - " out[gid] = sub_group_broadcast_first(x);\n" - " } else {\n" - " out[gid] = sub_group_broadcast_first(x);\n" - " }\n" - "}\n"; - -static const char *ballot_bit_count_source = - "__kernel void test_sub_group_ballot_bit_count(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " uint4 value = (uint4)(0,0,0,0);\n" - " value = (uint4)(sub_group_ballot_bit_count(x),0,0,0);\n" - " out[gid] = value;\n" - "}\n"; - -static const char *ballot_inclusive_scan_source = - "__kernel void test_sub_group_ballot_inclusive_scan(const __global Type " - "*in, __global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " uint4 value = (uint4)(0,0,0,0);\n" - " value = (uint4)(sub_group_ballot_inclusive_scan(x),0,0,0);\n" - " out[gid] = value;\n" - "}\n"; - -static const char *ballot_exclusive_scan_source = - "__kernel void test_sub_group_ballot_exclusive_scan(const __global Type " - "*in, __global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " uint4 value = (uint4)(0,0,0,0);\n" - " value = (uint4)(sub_group_ballot_exclusive_scan(x),0,0,0);\n" - " out[gid] = value;\n" - "}\n"; - -static const char *ballot_find_lsb_source = - "__kernel void test_sub_group_ballot_find_lsb(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " uint4 value = (uint4)(0,0,0,0);\n" - " value = (uint4)(sub_group_ballot_find_lsb(x),0,0,0);\n" - " out[gid] = value;\n" - "}\n"; - -static const char *ballot_find_msb_source = - "__kernel void test_sub_group_ballot_find_msb(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " uint4 value = (uint4)(0,0,0,0);" - " value = (uint4)(sub_group_ballot_find_msb(x),0,0,0);" - " out[gid] = value ;" - "}\n"; - -static const char *get_subgroup_ge_mask_source = - "__kernel void test_get_sub_group_ge_mask(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].z = get_max_sub_group_size();\n" - " Type x = in[gid];\n" - " uint4 mask = get_sub_group_ge_mask();" - " out[gid] = mask;\n" - "}\n"; - -static const char *get_subgroup_gt_mask_source = - "__kernel void test_get_sub_group_gt_mask(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].z = get_max_sub_group_size();\n" - " Type x = in[gid];\n" - " uint4 mask = get_sub_group_gt_mask();" - " out[gid] = mask;\n" - "}\n"; - -static const char *get_subgroup_le_mask_source = - "__kernel void test_get_sub_group_le_mask(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].z = get_max_sub_group_size();\n" - " Type x = in[gid];\n" - " uint4 mask = get_sub_group_le_mask();" - " out[gid] = mask;\n" - "}\n"; - -static const char *get_subgroup_lt_mask_source = - "__kernel void test_get_sub_group_lt_mask(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].z = get_max_sub_group_size();\n" - " Type x = in[gid];\n" - " uint4 mask = get_sub_group_lt_mask();" - " out[gid] = mask;\n" - "}\n"; - -static const char *get_subgroup_eq_mask_source = - "__kernel void test_get_sub_group_eq_mask(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].z = get_max_sub_group_size();\n" - " Type x = in[gid];\n" - " uint4 mask = get_sub_group_eq_mask();" - " out[gid] = mask;\n" - "}\n"; - -static const char *ballot_source = - "__kernel void test_sub_group_ballot(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - "uint4 full_ballot = sub_group_ballot(1);\n" - "uint divergence_mask;\n" - "uint4 partial_ballot;\n" - "uint gid = get_global_id(0);" - "XY(xy,gid);\n" - "if (get_sub_group_local_id() & 1) {\n" - " divergence_mask = 0xaaaaaaaa;\n" - " partial_ballot = sub_group_ballot(1);\n" - "} else {\n" - " divergence_mask = 0x55555555;\n" - " partial_ballot = sub_group_ballot(1);\n" - "}\n" - " size_t lws = get_local_size(0);\n" - "uint4 masked_ballot = full_ballot;\n" - "masked_ballot.x &= divergence_mask;\n" - "masked_ballot.y &= divergence_mask;\n" - "masked_ballot.z &= divergence_mask;\n" - "masked_ballot.w &= divergence_mask;\n" - "out[gid] = all(masked_ballot == partial_ballot);\n" - - "} \n"; - -static const char *ballot_source_inverse = - "__kernel void test_sub_group_ballot_inverse(const __global " - "Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " uint4 value = (uint4)(10,0,0,0);\n" - " if (get_sub_group_local_id() & 1) {" - " uint4 partial_ballot_mask = " - "(uint4)(0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA);" - " if (sub_group_inverse_ballot(partial_ballot_mask)) {\n" - " value = (uint4)(1,0,0,1);\n" - " } else {\n" - " value = (uint4)(0,0,0,1);\n" - " }\n" - " } else {\n" - " uint4 partial_ballot_mask = " - "(uint4)(0x55555555,0x55555555,0x55555555,0x55555555);" - " if (sub_group_inverse_ballot(partial_ballot_mask)) {\n" - " value = (uint4)(1,0,0,2);\n" - " } else {\n" - " value = (uint4)(0,0,0,2);\n" - " }\n" - " }\n" - " out[gid] = value;\n" - "}\n"; - -static const char *ballot_bit_extract_source = - "__kernel void test_sub_group_ballot_bit_extract(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " uint index = xy[gid].z;\n" - " uint4 value = (uint4)(10,0,0,0);\n" - " if (get_sub_group_local_id() & 1) {" - " if (sub_group_ballot_bit_extract(x, xy[gid].z)) {\n" - " value = (uint4)(1,0,0,1);\n" - " } else {\n" - " value = (uint4)(0,0,0,1);\n" - " }\n" - " } else {\n" - " if (sub_group_ballot_bit_extract(x, xy[gid].w)) {\n" - " value = (uint4)(1,0,0,2);\n" - " } else {\n" - " value = (uint4)(0,0,0,2);\n" - " }\n" - " }\n" - " out[gid] = value;\n" - "}\n"; +} +)"; +std::string sub_group_inverse_ballot_source = R"( +__kernel void test_sub_group_inverse_ballot(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + Type x = in[gid]; + uint4 value = (uint4)(10,0,0,0); + if (get_sub_group_local_id() & 1) { + uint4 partial_ballot_mask = (uint4)(0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA); + if (sub_group_inverse_ballot(partial_ballot_mask)) { + value = (uint4)(1,0,0,1); + } else { + value = (uint4)(0,0,0,1); + } + } else { + uint4 partial_ballot_mask = (uint4)(0x55555555,0x55555555,0x55555555,0x55555555); + if (sub_group_inverse_ballot(partial_ballot_mask)) { + value = (uint4)(1,0,0,2); + } else { + value = (uint4)(0,0,0,2); + } + } + out[gid] = value; +} +)"; +std::string sub_group_ballot_bit_extract_source = R"( + __kernel void test_sub_group_ballot_bit_extract(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + Type x = in[gid]; + uint index = xy[gid].z; + uint4 value = (uint4)(10,0,0,0); + if (get_sub_group_local_id() & 1) { + if (sub_group_ballot_bit_extract(x, xy[gid].z)) { + value = (uint4)(1,0,0,1); + } else { + value = (uint4)(0,0,0,1); + } + } else { + if (sub_group_ballot_bit_extract(x, xy[gid].w)) { + value = (uint4)(1,0,0,2); + } else { + value = (uint4)(0,0,0,2); + } + } + out[gid] = value; +} +)"; template int run_non_uniform_broadcast_for_type(RunTestForType rft) { int error = rft.run_impl>( - "test_bcast_non_uniform", bcast_non_uniform_source); + "sub_group_non_uniform_broadcast"); return error; } @@ -932,9 +820,15 @@ int test_subgroup_functions_ballot(cl_device_id device, cl_context context, "skipping test.\n"); return TEST_SKIPPED_ITSELF; } + constexpr size_t global_work_size = 170; constexpr size_t local_work_size = 64; WorkGroupParams test_params(global_work_size, local_work_size); + test_params.save_kernel_source(sub_group_ballot_mask_source); + test_params.save_kernel_source(sub_group_non_uniform_broadcast_source, + "sub_group_non_uniform_broadcast"); + test_params.save_kernel_source(sub_group_broadcast_first_source, + "sub_group_broadcast_first"); RunTestForType rft(device, context, queue, num_elements, test_params); // non uniform broadcast functions @@ -1018,76 +912,87 @@ int test_subgroup_functions_ballot(cl_device_id device, cl_context context, // broadcast first functions error |= rft.run_impl>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); error |= rft.run_impl< subgroups::cl_half, BC>( - "test_bcast_first", bcast_first_source); + "sub_group_broadcast_first"); // mask functions error |= rft.run_impl>( - "test_get_sub_group_eq_mask", get_subgroup_eq_mask_source); + "get_sub_group_eq_mask"); error |= rft.run_impl>( - "test_get_sub_group_ge_mask", get_subgroup_ge_mask_source); + "get_sub_group_ge_mask"); error |= rft.run_impl>( - "test_get_sub_group_gt_mask", get_subgroup_gt_mask_source); + "get_sub_group_gt_mask"); error |= rft.run_impl>( - "test_get_sub_group_le_mask", get_subgroup_le_mask_source); + "get_sub_group_le_mask"); error |= rft.run_impl>( - "test_get_sub_group_lt_mask", get_subgroup_lt_mask_source); + "get_sub_group_lt_mask"); // ballot functions - error |= rft.run_impl>("test_sub_group_ballot", - ballot_source); - error |= rft.run_impl>( - "test_sub_group_ballot_inverse", ballot_source_inverse); - error |= rft.run_impl< + WorkGroupParams test_params_ballot(global_work_size, local_work_size); + test_params_ballot.save_kernel_source( + sub_group_ballot_bit_scan_find_source); + test_params_ballot.save_kernel_source(sub_group_ballot_source, + "sub_group_ballot"); + test_params_ballot.save_kernel_source(sub_group_inverse_ballot_source, + "sub_group_inverse_ballot"); + test_params_ballot.save_kernel_source(sub_group_ballot_bit_extract_source, + "sub_group_ballot_bit_extract"); + RunTestForType rft_ballot(device, context, queue, num_elements, + test_params_ballot); + error |= rft_ballot.run_impl>("sub_group_ballot"); + error |= + rft_ballot.run_impl>( + "sub_group_inverse_ballot"); + error |= rft_ballot.run_impl< cl_uint4, BALLOT_BIT_EXTRACT>( - "test_sub_group_ballot_bit_extract", ballot_bit_extract_source); - error |= rft.run_impl< + "sub_group_ballot_bit_extract"); + error |= rft_ballot.run_impl< cl_uint4, BALLOT_COUNT_SCAN_FIND>( - "test_sub_group_ballot_bit_count", ballot_bit_count_source); - error |= rft.run_impl< + "sub_group_ballot_bit_count"); + error |= rft_ballot.run_impl< cl_uint4, BALLOT_COUNT_SCAN_FIND>( - "test_sub_group_ballot_inclusive_scan", ballot_inclusive_scan_source); - error |= rft.run_impl< + "sub_group_ballot_inclusive_scan"); + error |= rft_ballot.run_impl< cl_uint4, BALLOT_COUNT_SCAN_FIND>( - "test_sub_group_ballot_exclusive_scan", ballot_exclusive_scan_source); - error |= rft.run_impl< + "sub_group_ballot_exclusive_scan"); + error |= rft_ballot.run_impl< cl_uint4, BALLOT_COUNT_SCAN_FIND>( - "test_sub_group_ballot_find_lsb", ballot_find_lsb_source); - error |= rft.run_impl< + "sub_group_ballot_find_lsb"); + error |= rft_ballot.run_impl< cl_uint4, BALLOT_COUNT_SCAN_FIND>( - "test_sub_group_ballot_find_msb", ballot_find_msb_source); + "sub_group_ballot_find_msb"); return error; } diff --git a/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp index 87507e37..11fcebc4 100644 --- a/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp +++ b/test_conformance/subgroups/test_subgroup_clustered_reduce.cpp @@ -22,149 +22,17 @@ #define CLUSTER_SIZE_STR "4" namespace { -static const char *redadd_clustered_source = - "__kernel void test_redadd_clustered(const __global Type *in, __global " - "int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if (sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR ")))\n" - " {xy[gid].w = sizeof(in[gid]);}\n" - " out[gid] = sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR - ");\n" - "}\n"; - -static const char *redmax_clustered_source = - "__kernel void test_redmax_clustered(const __global Type *in, __global " - "int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if (sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR ")))\n" - " {xy[gid].w = sizeof(in[gid]);}\n" - " out[gid] = sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR - ");\n" - "}\n"; - -static const char *redmin_clustered_source = - "__kernel void test_redmin_clustered(const __global Type *in, __global " - "int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if (sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR ")))\n" - " {xy[gid].w = sizeof(in[gid]);}\n" - " out[gid] = sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR - ");\n" - "}\n"; - -static const char *redmul_clustered_source = - "__kernel void test_redmul_clustered(const __global Type *in, __global " - "int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if (sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR ")))\n" - " {xy[gid].w = sizeof(in[gid]);}\n" - " out[gid] = sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR - ");\n" - "}\n"; - -static const char *redand_clustered_source = - "__kernel void test_redand_clustered(const __global Type *in, __global " - "int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if (sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR ")))\n" - " {xy[gid].w = sizeof(in[gid]);}\n" - " out[gid] = sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR - ");\n" - "}\n"; - -static const char *redor_clustered_source = - "__kernel void test_redor_clustered(const __global Type *in, __global int4 " - "*xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if (sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR ")))\n" - " {xy[gid].w = sizeof(in[gid]);}\n" - " out[gid] = sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR - ");\n" - "}\n"; - -static const char *redxor_clustered_source = - "__kernel void test_redxor_clustered(const __global Type *in, __global " - "int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if (sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR ")))\n" - " {xy[gid].w = sizeof(in[gid]);}\n" - " out[gid] = sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR - ");\n" - "}\n"; - -static const char *redand_clustered_logical_source = - "__kernel void test_redand_clustered_logical(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if (sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR - ")))\n" - " {xy[gid].w = sizeof(in[gid]);}\n" - " out[gid] = " - "sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR ");\n" - "}\n"; - -static const char *redor_clustered_logical_source = - "__kernel void test_redor_clustered_logical(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if (sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR - ")))\n" - " {xy[gid].w = sizeof(in[gid]);}\n" - " out[gid] = " - "sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR ");\n" - "}\n"; - -static const char *redxor_clustered_logical_source = - "__kernel void test_redxor_clustered_logical(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " xy[gid].w = 0;\n" - " if ( sizeof(in[gid]) == " - "sizeof(sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR - ")))\n" - " {xy[gid].w = sizeof(in[gid]);}\n" - " out[gid] = " - "sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR ");\n" - "}\n"; - +std::string sub_group_clustered_reduce_source = R"( +__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) { + int gid = get_global_id(0); + XY(xy,gid); + xy[gid].w = 0; + if (sizeof(in[gid]) == sizeof(%s(in[gid], )" CLUSTER_SIZE_STR R"())) { + xy[gid].w = sizeof(in[gid]); + } + out[gid] = %s(in[gid], )" CLUSTER_SIZE_STR R"(); +} +)"; // DESCRIPTION: // Test for reduce cluster functions @@ -267,34 +135,34 @@ template int run_cluster_red_add_max_min_mul_for_type(RunTestForType rft) { int error = rft.run_impl>( - "test_redadd_clustered", redadd_clustered_source); + "sub_group_clustered_reduce_add"); error |= rft.run_impl>( - "test_redmax_clustered", redmax_clustered_source); + "sub_group_clustered_reduce_max"); error |= rft.run_impl>( - "test_redmin_clustered", redmin_clustered_source); + "sub_group_clustered_reduce_min"); error |= rft.run_impl>( - "test_redmul_clustered", redmul_clustered_source); + "sub_group_clustered_reduce_mul"); return error; } template int run_cluster_and_or_xor_for_type(RunTestForType rft) { int error = rft.run_impl>( - "test_redand_clustered", redand_clustered_source); + "sub_group_clustered_reduce_and"); error |= rft.run_impl>( - "test_redor_clustered", redor_clustered_source); + "sub_group_clustered_reduce_or"); error |= rft.run_impl>( - "test_redxor_clustered", redxor_clustered_source); + "sub_group_clustered_reduce_xor"); return error; } template int run_cluster_logical_and_or_xor_for_type(RunTestForType rft) { int error = rft.run_impl>( - "test_redand_clustered_logical", redand_clustered_logical_source); + "sub_group_clustered_reduce_logical_and"); error |= rft.run_impl>( - "test_redor_clustered_logical", redor_clustered_logical_source); + "sub_group_clustered_reduce_logical_or"); error |= rft.run_impl>( - "test_redxor_clustered_logical", redxor_clustered_logical_source); + "sub_group_clustered_reduce_logical_xor"); return error; } @@ -311,9 +179,11 @@ int test_subgroup_functions_clustered_reduce(cl_device_id device, "device, skipping test.\n"); return TEST_SKIPPED_ITSELF; } + constexpr size_t global_work_size = 2000; constexpr size_t local_work_size = 200; WorkGroupParams test_params(global_work_size, local_work_size); + test_params.save_kernel_source(sub_group_clustered_reduce_source); RunTestForType rft(device, context, queue, num_elements, test_params); int error = run_cluster_red_add_max_min_mul_for_type(rft); diff --git a/test_conformance/subgroups/test_subgroup_extended_types.cpp b/test_conformance/subgroups/test_subgroup_extended_types.cpp index b281f618..dbe24623 100644 --- a/test_conformance/subgroups/test_subgroup_extended_types.cpp +++ b/test_conformance/subgroups/test_subgroup_extended_types.cpp @@ -24,30 +24,30 @@ namespace { template int run_broadcast_for_extended_type(RunTestForType rft) { int error = rft.run_impl>( - "test_bcast", bcast_source); + "sub_group_broadcast"); return error; } template int run_scan_reduction_for_type(RunTestForType rft) { - int error = rft.run_impl>("test_redadd", - redadd_source); - error |= rft.run_impl>("test_redmax", - redmax_source); - error |= rft.run_impl>("test_redmin", - redmin_source); - error |= rft.run_impl>("test_scinadd", - scinadd_source); - error |= rft.run_impl>("test_scinmax", - scinmax_source); - error |= rft.run_impl>("test_scinmin", - scinmin_source); - error |= rft.run_impl>("test_scexadd", - scexadd_source); - error |= rft.run_impl>("test_scexmax", - scexmax_source); - error |= rft.run_impl>("test_scexmin", - scexmin_source); + int error = + rft.run_impl>("sub_group_reduce_add"); + error |= + rft.run_impl>("sub_group_reduce_max"); + error |= + rft.run_impl>("sub_group_reduce_min"); + error |= rft.run_impl>( + "sub_group_scan_inclusive_add"); + error |= rft.run_impl>( + "sub_group_scan_inclusive_max"); + error |= rft.run_impl>( + "sub_group_scan_inclusive_min"); + error |= rft.run_impl>( + "sub_group_scan_exclusive_add"); + error |= rft.run_impl>( + "sub_group_scan_exclusive_max"); + error |= rft.run_impl>( + "sub_group_scan_exclusive_min"); return error; } @@ -65,11 +65,15 @@ int test_subgroup_functions_extended_types(cl_device_id device, "device, skipping test.\n"); return TEST_SKIPPED_ITSELF; } + constexpr size_t global_work_size = 2000; constexpr size_t local_work_size = 200; WorkGroupParams test_params(global_work_size, local_work_size); - RunTestForType rft(device, context, queue, num_elements, test_params); + test_params.save_kernel_source(sub_group_reduction_scan_source); + test_params.save_kernel_source(sub_group_generic_source, + "sub_group_broadcast"); + RunTestForType rft(device, context, queue, num_elements, test_params); int error = run_broadcast_for_extended_type(rft); error |= run_broadcast_for_extended_type(rft); error |= run_broadcast_for_extended_type(rft); diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp index 6c44249e..bb257bcd 100644 --- a/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp +++ b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp @@ -17,336 +17,29 @@ #include "subhelpers.h" #include "harness/typeWrappers.h" #include "subgroup_common_templates.h" +#include namespace { -static const char *scinadd_non_uniform_source = R"( - __kernel void test_scinadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { +std::string sub_group_non_uniform_arithmetic_source = R"( + __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) { int gid = get_global_id(0); XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_inclusive_add(in[gid]); - } - } -)"; - -static const char *scinmax_non_uniform_source = R"( - __kernel void test_scinmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_inclusive_max(in[gid]); - } - } -)"; - -static const char *scinmin_non_uniform_source = R"( - __kernel void test_scinmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_inclusive_min(in[gid]); - } - } -)"; - -static const char *scinmul_non_uniform_source = R"( - __kernel void test_scinmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_inclusive_mul(in[gid]); - } - } -)"; - -static const char *scinand_non_uniform_source = R"( - __kernel void test_scinand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_inclusive_and(in[gid]); - } - } -)"; - -static const char *scinor_non_uniform_source = R"( - __kernel void test_scinor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_inclusive_or(in[gid]); - } - } -)"; - -static const char *scinxor_non_uniform_source = R"( - __kernel void test_scinxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_inclusive_xor(in[gid]); - } - } -)"; - -static const char *scinand_non_uniform_logical_source = R"( - __kernel void test_scinand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_inclusive_logical_and(in[gid]); - } - } -)"; - -static const char *scinor_non_uniform_logical_source = R"( - __kernel void test_scinor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_inclusive_logical_or(in[gid]); - } - } -)"; - -static const char *scinxor_non_uniform_logical_source = R"( - __kernel void test_scinxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_inclusive_logical_xor(in[gid]); - } - } -)"; - -static const char *scexadd_non_uniform_source = R"( - __kernel void test_scexadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_exclusive_add(in[gid]); - } - } -)"; - -static const char *scexmax_non_uniform_source = R"( - __kernel void test_scexmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_exclusive_max(in[gid]); - } - } -)"; - -static const char *scexmin_non_uniform_source = R"( - __kernel void test_scexmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_exclusive_min(in[gid]); - } - } -)"; - -static const char *scexmul_non_uniform_source = R"( - __kernel void test_scexmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_exclusive_mul(in[gid]); - } - } -)"; - -static const char *scexand_non_uniform_source = R"( - __kernel void test_scexand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_exclusive_and(in[gid]); - } - } -)"; - -static const char *scexor_non_uniform_source = R"( - __kernel void test_scexor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_exclusive_or(in[gid]); - } - } -)"; - -static const char *scexxor_non_uniform_source = R"( - __kernel void test_scexxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_exclusive_xor(in[gid]); - } - } -)"; - -static const char *scexand_non_uniform_logical_source = R"( - __kernel void test_scexand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_exclusive_logical_and(in[gid]); - } - } -)"; - -static const char *scexor_non_uniform_logical_source = R"( - __kernel void test_scexor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_exclusive_logical_or(in[gid]); - } - } -)"; - -static const char *scexxor_non_uniform_logical_source = R"( - __kernel void test_scexxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_scan_exclusive_logical_xor(in[gid]); - } - } -)"; - -static const char *redadd_non_uniform_source = R"( - __kernel void test_redadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_add(in[gid]); - } - } -)"; - -static const char *redmax_non_uniform_source = R"( - __kernel void test_redmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_max(in[gid]); - } - } -)"; - -static const char *redmin_non_uniform_source = R"( - __kernel void test_redmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_min(in[gid]); - } - } -)"; - -static const char *redmul_non_uniform_source = R"( - __kernel void test_redmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_mul(in[gid]); - } - } -)"; - -static const char *redand_non_uniform_source = R"( - __kernel void test_redand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_and(in[gid]); - } - } -)"; - -static const char *redor_non_uniform_source = R"( - __kernel void test_redor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_or(in[gid]); - } - } -)"; - -static const char *redxor_non_uniform_source = R"( - __kernel void test_redxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_xor(in[gid]); - } - } -)"; - -static const char *redand_non_uniform_logical_source = R"( - __kernel void test_redand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_logical_and(in[gid]); - } - } -)"; - -static const char *redor_non_uniform_logical_source = R"( - __kernel void test_redor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_logical_or(in[gid]); - } - } -)"; - -static const char *redxor_non_uniform_logical_source = R"( - __kernel void test_redxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - int elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_reduce_logical_xor(in[gid]); - } + uint subgroup_local_id = get_sub_group_local_id(); + uint elect_work_item = 1 << (subgroup_local_id % 32); + uint work_item_mask; + if(subgroup_local_id < 32) { + work_item_mask = work_item_mask_vector.x; + } else if(subgroup_local_id < 64) { + work_item_mask = work_item_mask_vector.y; + } else if(subgroup_local_id < 96) { + work_item_mask = work_item_mask_vector.w; + } else if(subgroup_local_id < 128) { + work_item_mask = work_item_mask_vector.z; + } + if (elect_work_item & work_item_mask){ + out[gid] = %s(in[gid]); + } } )"; @@ -354,52 +47,52 @@ template int run_functions_add_mul_max_min_for_type(RunTestForType rft) { int error = rft.run_impl>( - "test_scinadd_non_uniform", scinadd_non_uniform_source); + "sub_group_non_uniform_scan_inclusive_add"); error |= rft.run_impl>( - "test_scinmul_non_uniform", scinmul_non_uniform_source); + "sub_group_non_uniform_scan_inclusive_mul"); error |= rft.run_impl>( - "test_scinmax_non_uniform", scinmax_non_uniform_source); + "sub_group_non_uniform_scan_inclusive_max"); error |= rft.run_impl>( - "test_scinmin_non_uniform", scinmin_non_uniform_source); + "sub_group_non_uniform_scan_inclusive_min"); error |= rft.run_impl>( - "test_scexadd_non_uniform", scexadd_non_uniform_source); + "sub_group_non_uniform_scan_exclusive_add"); error |= rft.run_impl>( - "test_scexmul_non_uniform", scexmul_non_uniform_source); + "sub_group_non_uniform_scan_exclusive_mul"); error |= rft.run_impl>( - "test_scexmax_non_uniform", scexmax_non_uniform_source); + "sub_group_non_uniform_scan_exclusive_max"); error |= rft.run_impl>( - "test_scexmin_non_uniform", scexmin_non_uniform_source); + "sub_group_non_uniform_scan_exclusive_min"); error |= rft.run_impl>( - "test_redadd_non_uniform", redadd_non_uniform_source); + "sub_group_non_uniform_reduce_add"); error |= rft.run_impl>( - "test_redmul_non_uniform", redmul_non_uniform_source); + "sub_group_non_uniform_reduce_mul"); error |= rft.run_impl>( - "test_redmax_non_uniform", redmax_non_uniform_source); + "sub_group_non_uniform_reduce_max"); error |= rft.run_impl>( - "test_redmin_non_uniform", redmin_non_uniform_source); + "sub_group_non_uniform_reduce_min"); return error; } template int run_functions_and_or_xor_for_type(RunTestForType rft) { int error = rft.run_impl>( - "test_scinand_non_uniform", scinand_non_uniform_source); + "sub_group_non_uniform_scan_inclusive_and"); error |= rft.run_impl>( - "test_scinor_non_uniform", scinor_non_uniform_source); + "sub_group_non_uniform_scan_inclusive_or"); error |= rft.run_impl>( - "test_scinxor_non_uniform", scinxor_non_uniform_source); + "sub_group_non_uniform_scan_inclusive_xor"); error |= rft.run_impl>( - "test_scexand_non_uniform", scexand_non_uniform_source); + "sub_group_non_uniform_scan_exclusive_and"); error |= rft.run_impl>( - "test_scexor_non_uniform", scexor_non_uniform_source); + "sub_group_non_uniform_scan_exclusive_or"); error |= rft.run_impl>( - "test_scexxor_non_uniform", scexxor_non_uniform_source); + "sub_group_non_uniform_scan_exclusive_xor"); error |= rft.run_impl>( - "test_redand_non_uniform", redand_non_uniform_source); + "sub_group_non_uniform_reduce_and"); error |= rft.run_impl>( - "test_redor_non_uniform", redor_non_uniform_source); + "sub_group_non_uniform_reduce_or"); error |= rft.run_impl>( - "test_redxor_non_uniform", redxor_non_uniform_source); + "sub_group_non_uniform_reduce_xor"); return error; } @@ -407,23 +100,23 @@ template int run_functions_logical_and_or_xor_for_type(RunTestForType rft) { int error = rft.run_impl>( - "test_scinand_non_uniform_logical", scinand_non_uniform_logical_source); + "sub_group_non_uniform_scan_inclusive_logical_and"); error |= rft.run_impl>( - "test_scinor_non_uniform_logical", scinor_non_uniform_logical_source); + "sub_group_non_uniform_scan_inclusive_logical_or"); error |= rft.run_impl>( - "test_scinxor_non_uniform_logical", scinxor_non_uniform_logical_source); + "sub_group_non_uniform_scan_inclusive_logical_xor"); error |= rft.run_impl>( - "test_scexand_non_uniform_logical", scexand_non_uniform_logical_source); + "sub_group_non_uniform_scan_exclusive_logical_and"); error |= rft.run_impl>( - "test_scexor_non_uniform_logical", scexor_non_uniform_logical_source); + "sub_group_non_uniform_scan_exclusive_logical_or"); error |= rft.run_impl>( - "test_scexxor_non_uniform_logical", scexxor_non_uniform_logical_source); + "sub_group_non_uniform_scan_exclusive_logical_xor"); error |= rft.run_impl>( - "test_redand_non_uniform_logical", redand_non_uniform_logical_source); + "sub_group_non_uniform_reduce_logical_and"); error |= rft.run_impl>( - "test_redor_non_uniform_logical", redor_non_uniform_logical_source); + "sub_group_non_uniform_reduce_logical_or"); error |= rft.run_impl>( - "test_redxor_non_uniform_logical", redxor_non_uniform_logical_source); + "sub_group_non_uniform_reduce_logical_xor"); return error; } @@ -441,13 +134,11 @@ int test_subgroup_functions_non_uniform_arithmetic(cl_device_id device, "this device, skipping test.\n"); return TEST_SKIPPED_ITSELF; } - std::vector masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555, - 0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00, - 0x00ffff00, 0x80000000, 0xaaaaaaaa }; constexpr size_t global_work_size = 2000; constexpr size_t local_work_size = 200; - WorkGroupParams test_params(global_work_size, local_work_size, masks); + WorkGroupParams test_params(global_work_size, local_work_size, true); + test_params.save_kernel_source(sub_group_non_uniform_arithmetic_source); RunTestForType rft(device, context, queue, num_elements, test_params); int error = run_functions_add_mul_max_min_for_type(rft); diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp index 484e9b6b..f956960b 100644 --- a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp +++ b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp @@ -28,7 +28,6 @@ template struct VOTE int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; - uint32_t work_items_mask = test_params.work_items_mask; int nj = (nw + ns - 1) / ns; int non_uniform_size = ng % nw; ng = ng / nw; @@ -40,9 +39,11 @@ template struct VOTE operation_names(operation)); log_info(" test params: global size = %d local size = %d subgroups " - "size = %d work item mask = 0x%x data type (%s)\n", - test_params.global_workgroup_size, nw, ns, work_items_mask, + "size = %d data type (%s)\n", + test_params.global_workgroup_size, nw, ns, TypeManager::name()); + log_info(" work items mask: %s\n", + test_params.work_items_mask.to_string().c_str()); if (non_uniform_size) { log_info(" non uniform work group size mode ON\n"); @@ -99,7 +100,6 @@ template struct VOTE int nw = test_params.local_workgroup_size; int ns = test_params.subgroup_size; int ng = test_params.global_workgroup_size; - uint32_t work_items_mask = test_params.work_items_mask; int nj = (nw + ns - 1) / ns; cl_int tr, rr; int non_uniform_size = ng % nw; @@ -141,8 +141,7 @@ template struct VOTE std::set active_work_items; for (i = 0; i < n; ++i) { - uint32_t check_work_item = 1 << (i % 32); - if (work_items_mask & check_work_item) + if (test_params.work_items_mask.test(i)) { active_work_items.insert(i); switch (operation) @@ -215,46 +214,47 @@ template struct VOTE return TEST_PASS; } }; -static const char *elect_source = R"( - __kernel void test_elect(const __global Type *in, __global int4 *xy, __global Type *out) { + +std::string sub_group_elect_source = R"( + __kernel void test_sub_group_elect(const __global Type *in, __global int4 *xy, __global Type *out) { int gid = get_global_id(0); XY(xy,gid); - uint elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_elect(); - } + uint subgroup_local_id = get_sub_group_local_id(); + uint elect_work_item = 1 << (subgroup_local_id % 32); + uint work_item_mask; + if(subgroup_local_id < 32) { + work_item_mask = work_item_mask_vector.x; + } else if(subgroup_local_id < 64) { + work_item_mask = work_item_mask_vector.y; + } else if(subgroup_local_id < 96) { + work_item_mask = work_item_mask_vector.w; + } else if(subgroup_local_id < 128) { + work_item_mask = work_item_mask_vector.z; + } + if (elect_work_item & work_item_mask){ + out[gid] = sub_group_elect(); + } } )"; -static const char *non_uniform_any_source = R"( - __kernel void test_non_uniform_any(const __global Type *in, __global int4 *xy, __global Type *out) { +std::string sub_group_non_uniform_any_all_all_equal_source = R"( + __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) { int gid = get_global_id(0); XY(xy,gid); - uint elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_any(in[gid]); - } - } -)"; - -static const char *non_uniform_all_source = R"( - __kernel void test_non_uniform_all(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - uint elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_all(in[gid]); - } - } -)"; - -static const char *non_uniform_all_equal_source = R"( - __kernel void test_non_uniform_all_equal(const __global Type *in, __global int4 *xy, __global Type *out) { - int gid = get_global_id(0); - XY(xy,gid); - uint elect_work_item = 1 << (get_sub_group_local_id() % 32); - if (elect_work_item & WORK_ITEMS_MASK){ - out[gid] = sub_group_non_uniform_all_equal(in[gid]); + uint subgroup_local_id = get_sub_group_local_id(); + uint elect_work_item = 1 << (subgroup_local_id % 32); + uint work_item_mask; + if(subgroup_local_id < 32) { + work_item_mask = work_item_mask_vector.x; + } else if(subgroup_local_id < 64) { + work_item_mask = work_item_mask_vector.y; + } else if(subgroup_local_id < 96) { + work_item_mask = work_item_mask_vector.w; + } else if(subgroup_local_id < 128) { + work_item_mask = work_item_mask_vector.z; + } + if (elect_work_item & work_item_mask){ + out[gid] = %s(in[gid]); } } )"; @@ -262,7 +262,7 @@ static const char *non_uniform_all_equal_source = R"( template int run_vote_all_equal_for_type(RunTestForType rft) { int error = rft.run_impl>( - "test_non_uniform_all_equal", non_uniform_all_equal_source); + "sub_group_non_uniform_all_equal"); return error; } } @@ -278,12 +278,13 @@ int test_subgroup_functions_non_uniform_vote(cl_device_id device, "device, skipping test.\n"); return TEST_SKIPPED_ITSELF; } - std::vector masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555, - 0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00, - 0x00ffff00, 0x80000000 }; + constexpr size_t global_work_size = 170; constexpr size_t local_work_size = 64; - WorkGroupParams test_params(global_work_size, local_work_size, masks); + WorkGroupParams test_params(global_work_size, local_work_size, true); + test_params.save_kernel_source( + sub_group_non_uniform_any_all_all_equal_source); + test_params.save_kernel_source(sub_group_elect_source, "sub_group_elect"); RunTestForType rft(device, context, queue, num_elements, test_params); int error = run_vote_all_equal_for_type(rft); @@ -295,10 +296,10 @@ int test_subgroup_functions_non_uniform_vote(cl_device_id device, error |= run_vote_all_equal_for_type(rft); error |= rft.run_impl>( - "test_non_uniform_all", non_uniform_all_source); + "sub_group_non_uniform_all"); error |= rft.run_impl>( - "test_elect", elect_source); + "sub_group_elect"); error |= rft.run_impl>( - "test_non_uniform_any", non_uniform_any_source); + "sub_group_non_uniform_any"); return error; } diff --git a/test_conformance/subgroups/test_subgroup_shuffle.cpp b/test_conformance/subgroups/test_subgroup_shuffle.cpp index 37b27ced..56231cbf 100644 --- a/test_conformance/subgroups/test_subgroup_shuffle.cpp +++ b/test_conformance/subgroups/test_subgroup_shuffle.cpp @@ -15,38 +15,19 @@ // #include "procs.h" #include "subhelpers.h" +#include "subgroup_common_kernels.h" #include "subgroup_common_templates.h" #include "harness/typeWrappers.h" #include namespace { -static const char* shuffle_xor_source = - "__kernel void test_sub_group_shuffle_xor(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " out[gid] = sub_group_shuffle_xor(x, xy[gid].z);" - "}\n"; - -static const char* shuffle_source = - "__kernel void test_sub_group_shuffle(const __global Type *in, __global " - "int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " out[gid] = sub_group_shuffle(x, xy[gid].z);" - "}\n"; - template int run_shuffle_for_type(RunTestForType rft) { - int error = rft.run_impl>( - "test_sub_group_shuffle", shuffle_source); + int error = + rft.run_impl>("sub_group_shuffle"); error |= rft.run_impl>( - "test_sub_group_shuffle_xor", shuffle_xor_source); + "sub_group_shuffle_xor"); return error; } @@ -61,9 +42,11 @@ int test_subgroup_functions_shuffle(cl_device_id device, cl_context context, "skipping test.\n"); return TEST_SKIPPED_ITSELF; } + constexpr size_t global_work_size = 2000; constexpr size_t local_work_size = 200; WorkGroupParams test_params(global_work_size, local_work_size); + test_params.save_kernel_source(sub_group_generic_source); RunTestForType rft(device, context, queue, num_elements, test_params); int error = run_shuffle_for_type(rft); diff --git a/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp b/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp index 11401e80..caa1dccc 100644 --- a/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp +++ b/test_conformance/subgroups/test_subgroup_shuffle_relative.cpp @@ -15,37 +15,19 @@ // #include "procs.h" #include "subhelpers.h" +#include "subgroup_common_kernels.h" #include "subgroup_common_templates.h" #include "harness/conversions.h" #include "harness/typeWrappers.h" namespace { -static const char* shuffle_down_source = - "__kernel void test_sub_group_shuffle_down(const __global Type *in, " - "__global int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " out[gid] = sub_group_shuffle_down(x, xy[gid].z);" - "}\n"; -static const char* shuffle_up_source = - "__kernel void test_sub_group_shuffle_up(const __global Type *in, __global " - "int4 *xy, __global Type *out)\n" - "{\n" - " int gid = get_global_id(0);\n" - " XY(xy,gid);\n" - " Type x = in[gid];\n" - " out[gid] = sub_group_shuffle_up(x, xy[gid].z);" - "}\n"; - template int run_shuffle_relative_for_type(RunTestForType rft) { - int error = rft.run_impl>( - "test_sub_group_shuffle_up", shuffle_up_source); + int error = + rft.run_impl>("sub_group_shuffle_up"); error |= rft.run_impl>( - "test_sub_group_shuffle_down", shuffle_down_source); + "sub_group_shuffle_down"); return error; } @@ -62,9 +44,11 @@ int test_subgroup_functions_shuffle_relative(cl_device_id device, "device, skipping test.\n"); return TEST_SKIPPED_ITSELF; } + constexpr size_t global_work_size = 2000; constexpr size_t local_work_size = 200; WorkGroupParams test_params(global_work_size, local_work_size); + test_params.save_kernel_source(sub_group_generic_source); RunTestForType rft(device, context, queue, num_elements, test_params); int error = run_shuffle_relative_for_type(rft);