Extended subgroups - use 128bit masks (#1215)

* Extended subgroups - use 128bit masks

* Refactoring to avoid kernels code duplication

* unify kernel names as the test_ prefix + the subgroup function name
* use raw string literals to improve readability
* use kernel templates to limit code duplication
* WorkGroupParams allows defining a default kernel - a kernel template shared by multiple functions
* WorkGroupParams allows defining a kernel for one specific subgroup function

Co-authored-by: Stuart Brady <stuart.brady@arm.com>
This commit is contained in:
Grzegorz Wawiorko
2021-10-01 12:28:37 +02:00
committed by GitHub
parent 903f1bf65d
commit 92844bead1
12 changed files with 592 additions and 1054 deletions

View File

@@ -15,92 +15,20 @@
// //
#include "subgroup_common_kernels.h" #include "subgroup_common_kernels.h"
const char* bcast_source =
"__kernel void test_bcast(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" uint which_sub_group_local_id = xy[gid].z;\n"
" out[gid] = sub_group_broadcast(x, which_sub_group_local_id);\n"
"}\n"; std::string sub_group_reduction_scan_source = R"(
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
out[gid] = %s(in[gid]);
}
)";
const char* redadd_source = "__kernel void test_redadd(const __global Type " std::string sub_group_generic_source = R"(
"*in, __global int4 *xy, __global Type *out)\n" __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
"{\n" int gid = get_global_id(0);
" int gid = get_global_id(0);\n" XY(xy,gid);
" XY(xy,gid);\n" Type x = in[gid];
" out[gid] = sub_group_reduce_add(in[gid]);\n" out[gid] = %s(x, xy[gid].z);
"}\n"; }
)";
const char* redmax_source = "__kernel void test_redmax(const __global Type "
"*in, __global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_reduce_max(in[gid]);\n"
"}\n";
const char* redmin_source = "__kernel void test_redmin(const __global Type "
"*in, __global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_reduce_min(in[gid]);\n"
"}\n";
const char* scinadd_source =
"__kernel void test_scinadd(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_inclusive_add(in[gid]);\n"
"}\n";
const char* scinmax_source =
"__kernel void test_scinmax(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_inclusive_max(in[gid]);\n"
"}\n";
const char* scinmin_source =
"__kernel void test_scinmin(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_inclusive_min(in[gid]);\n"
"}\n";
const char* scexadd_source =
"__kernel void test_scexadd(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_exclusive_add(in[gid]);\n"
"}\n";
const char* scexmax_source =
"__kernel void test_scexmax(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_exclusive_max(in[gid]);\n"
"}\n";
const char* scexmin_source =
"__kernel void test_scexmin(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_exclusive_min(in[gid]);\n"
"}\n";

View File

@@ -18,15 +18,7 @@
#include "subhelpers.h" #include "subhelpers.h"
extern const char* bcast_source; extern std::string sub_group_reduction_scan_source;
extern const char* redadd_source; extern std::string sub_group_generic_source;
extern const char* redmax_source;
extern const char* redmin_source;
extern const char* scinadd_source;
extern const char* scinmax_source;
extern const char* scinmin_source;
extern const char* scexadd_source;
extern const char* scexmax_source;
extern const char* scexmin_source;
#endif #endif

View File

@@ -17,13 +17,10 @@
#define SUBGROUPCOMMONTEMPLATES_H #define SUBGROUPCOMMONTEMPLATES_H
#include "typeWrappers.h" #include "typeWrappers.h"
#include <bitset>
#include "CL/cl_half.h" #include "CL/cl_half.h"
#include "subhelpers.h" #include "subhelpers.h"
#include <set> #include <set>
typedef std::bitset<128> bs128;
static cl_uint4 generate_bit_mask(cl_uint subgroup_local_id, static cl_uint4 generate_bit_mask(cl_uint subgroup_local_id,
const std::string &mask_type, const std::string &mask_type,
cl_uint max_sub_group_size) cl_uint max_sub_group_size)
@@ -577,16 +574,21 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
int nw = test_params.local_workgroup_size; int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size; int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size; int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
ng = ng / nw; ng = ng / nw;
std::string func_name; std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive" test_params.work_items_mask.any()
: func_name = "sub_group_scan_exclusive"; ? func_name = "sub_group_non_uniform_scan_exclusive"
: func_name = "sub_group_scan_exclusive";
log_info(" %s_%s(%s)...\n", func_name.c_str(), log_info(" %s_%s(%s)...\n", func_name.c_str(),
operation_names(operation), TypeManager<Ty>::name()); operation_names(operation), TypeManager<Ty>::name());
log_info(" test params: global size = %d local size = %d subgroups " log_info(" test params: global size = %d local size = %d subgroups "
"size = %d work item mask = 0x%x \n", "size = %d \n",
test_params.global_workgroup_size, nw, ns, work_items_mask); test_params.global_workgroup_size, nw, ns);
if (test_params.work_items_mask.any())
{
log_info(" work items mask: %s\n",
test_params.work_items_mask.to_string().c_str());
}
genrand<Ty, operation>(x, t, m, ns, nw, ng); genrand<Ty, operation>(x, t, m, ns, nw, ng);
} }
@@ -597,18 +599,22 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
int nw = test_params.local_workgroup_size; int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size; int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size; int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask; bs128 work_items_mask = test_params.work_items_mask;
int nj = (nw + ns - 1) / ns; int nj = (nw + ns - 1) / ns;
Ty tr, rr; Ty tr, rr;
ng = ng / nw; ng = ng / nw;
std::string func_name; std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive" test_params.work_items_mask.any()
: func_name = "sub_group_scan_exclusive"; ? func_name = "sub_group_non_uniform_scan_exclusive"
: func_name = "sub_group_scan_exclusive";
uint32_t use_work_items_mask;
// for uniform case take into consideration all workitems // for uniform case take into consideration all workitems
use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask; if (!work_items_mask.any())
{
work_items_mask.set();
}
for (k = 0; k < ng; ++k) for (k = 0; k < ng; ++k)
{ // for each work_group { // for each work_group
// Map to array indexed to array indexed by local ID and sub group // Map to array indexed to array indexed by local ID and sub group
@@ -624,8 +630,7 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
std::set<int> active_work_items; std::set<int> active_work_items;
for (i = 0; i < n; ++i) for (i = 0; i < n; ++i)
{ {
uint32_t check_work_item = 1 << (i % 32); if (work_items_mask.test(i))
if (use_work_items_mask & check_work_item)
{ {
active_work_items.insert(i); active_work_items.insert(i);
} }
@@ -688,18 +693,23 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU
int nw = test_params.local_workgroup_size; int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size; int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size; int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
ng = ng / nw; ng = ng / nw;
std::string func_name; std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive" test_params.work_items_mask.any()
: func_name = "sub_group_scan_inclusive"; ? func_name = "sub_group_non_uniform_scan_inclusive"
: func_name = "sub_group_scan_inclusive";
genrand<Ty, operation>(x, t, m, ns, nw, ng); genrand<Ty, operation>(x, t, m, ns, nw, ng);
log_info(" %s_%s(%s)...\n", func_name.c_str(), log_info(" %s_%s(%s)...\n", func_name.c_str(),
operation_names(operation), TypeManager<Ty>::name()); operation_names(operation), TypeManager<Ty>::name());
log_info(" test params: global size = %d local size = %d subgroups " log_info(" test params: global size = %d local size = %d subgroups "
"size = %d work item mask = 0x%x \n", "size = %d \n",
test_params.global_workgroup_size, nw, ns, work_items_mask); test_params.global_workgroup_size, nw, ns);
if (test_params.work_items_mask.any())
{
log_info(" work items mask: %s\n",
test_params.work_items_mask.to_string().c_str());
}
} }
static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
@@ -709,18 +719,22 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU
int nw = test_params.local_workgroup_size; int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size; int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size; int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask; bs128 work_items_mask = test_params.work_items_mask;
int nj = (nw + ns - 1) / ns; int nj = (nw + ns - 1) / ns;
Ty tr, rr; Ty tr, rr;
ng = ng / nw; ng = ng / nw;
std::string func_name; std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive" work_items_mask.any()
: func_name = "sub_group_scan_inclusive"; ? func_name = "sub_group_non_uniform_scan_inclusive"
: func_name = "sub_group_scan_inclusive";
uint32_t use_work_items_mask;
// for uniform case take into consideration all workitems // for uniform case take into consideration all workitems
use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask; if (!work_items_mask.any())
{
work_items_mask.set();
}
// std::bitset<32> mask32(use_work_items_mask); // std::bitset<32> mask32(use_work_items_mask);
// for (int k) mask32.count(); // for (int k) mask32.count();
for (k = 0; k < ng; ++k) for (k = 0; k < ng; ++k)
@@ -740,8 +754,7 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU
for (i = 0; i < n; ++i) for (i = 0; i < n; ++i)
{ {
uint32_t check_work_item = 1 << (i % 32); if (work_items_mask.test(i))
if (use_work_items_mask & check_work_item)
{ {
if (catch_frist_active == -1) if (catch_frist_active == -1)
{ {
@@ -807,17 +820,22 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
int nw = test_params.local_workgroup_size; int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size; int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size; int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
ng = ng / nw; ng = ng / nw;
std::string func_name; std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_reduce" test_params.work_items_mask.any()
: func_name = "sub_group_reduce"; ? func_name = "sub_group_non_uniform_reduce"
: func_name = "sub_group_reduce";
log_info(" %s_%s(%s)...\n", func_name.c_str(), log_info(" %s_%s(%s)...\n", func_name.c_str(),
operation_names(operation), TypeManager<Ty>::name()); operation_names(operation), TypeManager<Ty>::name());
log_info(" test params: global size = %d local size = %d subgroups " log_info(" test params: global size = %d local size = %d subgroups "
"size = %d work item mask = 0x%x \n", "size = %d \n",
test_params.global_workgroup_size, nw, ns, work_items_mask); test_params.global_workgroup_size, nw, ns);
if (test_params.work_items_mask.any())
{
log_info(" work items mask: %s\n",
test_params.work_items_mask.to_string().c_str());
}
genrand<Ty, operation>(x, t, m, ns, nw, ng); genrand<Ty, operation>(x, t, m, ns, nw, ng);
} }
@@ -828,14 +846,14 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
int nw = test_params.local_workgroup_size; int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size; int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size; int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask; bs128 work_items_mask = test_params.work_items_mask;
int nj = (nw + ns - 1) / ns; int nj = (nw + ns - 1) / ns;
ng = ng / nw; ng = ng / nw;
Ty tr, rr; Ty tr, rr;
std::string func_name; std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_reduce" work_items_mask.any() ? func_name = "sub_group_non_uniform_reduce"
: func_name = "sub_group_reduce"; : func_name = "sub_group_reduce";
for (k = 0; k < ng; ++k) for (k = 0; k < ng; ++k)
{ {
@@ -847,9 +865,10 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
my[j] = y[j]; my[j] = y[j];
} }
uint32_t use_work_items_mask; if (!work_items_mask.any())
use_work_items_mask = {
!work_items_mask ? 0xFFFFFFFF : work_items_mask; work_items_mask.set();
}
for (j = 0; j < nj; ++j) for (j = 0; j < nj; ++j)
{ {
@@ -859,8 +878,7 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
int catch_frist_active = -1; int catch_frist_active = -1;
for (i = 0; i < n; ++i) for (i = 0; i < n; ++i)
{ {
uint32_t check_work_item = 1 << (i % 32); if (work_items_mask.test(i))
if (use_work_items_mask & check_work_item)
{ {
if (catch_frist_active == -1) if (catch_frist_active == -1)
{ {

View File

@@ -24,31 +24,172 @@
#include <limits> #include <limits>
#include <vector> #include <vector>
#include <type_traits> #include <type_traits>
#include <bitset>
#include <regex>
#include <map>
#define NR_OF_ACTIVE_WORK_ITEMS 4 #define NR_OF_ACTIVE_WORK_ITEMS 4
extern MTdata gMTdata; extern MTdata gMTdata;
typedef std::bitset<128> bs128;
extern cl_half_rounding_mode g_rounding_mode; extern cl_half_rounding_mode g_rounding_mode;
struct WorkGroupParams struct WorkGroupParams
{ {
WorkGroupParams(size_t gws, size_t lws, WorkGroupParams(size_t gws, size_t lws,
const std::vector<uint32_t> &all_wim = {}) bool use_mask = false)
: global_workgroup_size(gws), local_workgroup_size(lws), : global_workgroup_size(gws), local_workgroup_size(lws),
all_work_item_masks(all_wim) use_masks(use_mask)
{ {
subgroup_size = 0; subgroup_size = 0;
work_items_mask = 0; work_items_mask = 0;
use_core_subgroups = true; use_core_subgroups = true;
dynsc = 0; dynsc = 0;
load_masks();
} }
size_t global_workgroup_size; size_t global_workgroup_size;
size_t local_workgroup_size; size_t local_workgroup_size;
size_t subgroup_size; size_t subgroup_size;
uint32_t work_items_mask; bs128 work_items_mask;
int dynsc; int dynsc;
bool use_core_subgroups; bool use_core_subgroups;
std::vector<uint32_t> all_work_item_masks; std::vector<bs128> all_work_item_masks;
bool use_masks;
// Registers the OpenCL C kernel source `source` under the key `name`.
//
// source: kernel source text to store.
// name:   key to store it under; an empty name stores the source under the
//         "default" key, which get_kernel_source() falls back to when a
//         requested name has no entry of its own.
//
// Registering the same key twice is allowed: the earlier source is
// overwritten and a note is written to the log.
void save_kernel_source(const std::string &source, std::string name = "")
{
    if (name.empty())
    {
        name = "default";
    }

    const bool already_registered =
        kernel_function_name.find(name) != kernel_function_name.end();
    if (already_registered)
    {
        // Terminate the message with a newline, like the other log_info
        // calls, so the next log line does not run into this one.
        log_info("Kernel definition duplication. Source will be "
                 "overwritten for function name %s\n",
                 name.c_str());
    }

    kernel_function_name[name] = source;
}
// Returns the kernel source registered for `name`, or the source stored
// under the "default" key when `name` has no entry of its own. Returns an
// empty string when neither entry exists.
//
// Lookups use find() only, so a miss never inserts an empty "default" entry
// into the map (which would later be reported as a duplicate definition by
// save_kernel_source()).
std::string get_kernel_source(const std::string &name)
{
    auto entry = kernel_function_name.find(name);
    if (entry == kernel_function_name.end())
    {
        entry = kernel_function_name.find("default");
    }

    if (entry == kernel_function_name.end())
    {
        return std::string();
    }
    return entry->second;
}
private:
std::map<std::string, std::string> kernel_function_name;
// Appends the fixed set of 128-bit work-item masks to all_work_item_masks.
// Does nothing unless use_masks is set.
void load_masks()
{
    if (!use_masks)
    {
        return;
    }

    // A mask is described by one 32-character word ('1' = bit set, the
    // leftmost character is the highest bit of the word) and a stride that
    // says in which of the four words, counted from the most significant
    // one, that word appears: stride 1 = every word, stride 2 = every other
    // word, stride 4 = the top word only. All remaining words are zero.
    struct MaskPattern
    {
        const char *word;
        unsigned stride;
    };

    static const MaskPattern patterns[] = {
        // 0xf0f0f0f0 in every word
        { "11110000111100001111000011110000", 1 },
        // 0x0f0f0f0f in every word (complement of the previous mask)
        { "00001111000011110000111100001111", 1 },
        // 0xaaaaaaaa in every word
        { "10101010101010101010101010101010", 1 },
        // 0x55555555 in every word (complement of the previous mask)
        { "01010101010101010101010101010101", 1 },
        // 0x0f0ff0f0 in every word
        { "00001111000011111111000011110000", 1 },
        // 0xff0000ff in every word
        { "11111111000000000000000011111111", 1 },
        // 0xff00ff00 in every word
        { "11111111000000001111111100000000", 1 },
        // 0x00ffff00 in every word
        { "00000000111111111111111100000000", 1 },
        // 0x80808080: only the highest bit of each 8-bit group
        { "10000000100000001000000010000000", 1 },
        // 0x80008000: only the highest bit of each 16-bit group
        { "10000000000000001000000000000000", 1 },
        // 0x80000000: only the highest bit of each 32-bit group
        { "10000000000000000000000000000000", 1 },
        // only the highest bit of each 64-bit group
        { "10000000000000000000000000000000", 2 },
        // only the highest bit of the whole 128-bit mask
        { "10000000000000000000000000000000", 4 },
        // every bit set
        { "11111111111111111111111111111111", 1 },
    };

    const std::string zero_word(32, '0');
    for (const MaskPattern &pattern : patterns)
    {
        // Assemble the 128-character text, most significant word first.
        std::string bits;
        bits.reserve(128);
        for (unsigned slot = 0; slot < 4; ++slot)
        {
            if (slot % pattern.stride == 0)
            {
                bits += pattern.word;
            }
            else
            {
                bits += zero_word;
            }
        }
        all_work_item_masks.push_back(bs128(bits));
    }
}
}; };
enum class SubgroupsBroadcastOp enum class SubgroupsBroadcastOp
@@ -1267,11 +1408,23 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
std::vector<Ty> mapout; std::vector<Ty> mapout;
mapout.resize(local); mapout.resize(local);
std::stringstream kernel_sstr; std::stringstream kernel_sstr;
if (test_params.work_items_mask != 0) if (test_params.use_masks)
{ {
kernel_sstr << "#define WORK_ITEMS_MASK "; // Prepare a uint4 to store the bitmask on the OpenCL C kernel side.
kernel_sstr << "0x" << std::hex << test_params.work_items_mask // to_string() puts the highest bit first, so the lowest 32 bits
<< "\n"; // are the last 32 characters of the string; hence the offsets
// passed to the bitset constructor start at 96 for the lowest word.
std::bitset<32> bits_1_32(test_params.work_items_mask.to_string(),
96, 32);
std::bitset<32> bits_33_64(test_params.work_items_mask.to_string(),
64, 32);
std::bitset<32> bits_65_96(test_params.work_items_mask.to_string(),
32, 32);
std::bitset<32> bits_97_128(test_params.work_items_mask.to_string(),
0, 32);
kernel_sstr << "global uint4 work_item_mask_vector = (uint4)(0b"
<< bits_1_32 << ",0b" << bits_33_64 << ",0b"
<< bits_65_96 << ",0b" << bits_97_128 << ");\n";
} }
@@ -1452,18 +1605,24 @@ struct RunTestForType
num_elements_(num_elements), test_params_(test_params) num_elements_(num_elements), test_params_(test_params)
{} {}
template <typename T, typename U> template <typename T, typename U>
int run_impl(const char *kernel_name, const char *source) int run_impl(const std::string &function_name)
{ {
int error = TEST_PASS; int error = TEST_PASS;
std::string source =
std::regex_replace(test_params_.get_kernel_source(function_name),
std::regex("\\%s"), function_name);
std::string kernel_name = "test_" + function_name;
if (test_params_.all_work_item_masks.size() > 0) if (test_params_.all_work_item_masks.size() > 0)
{ {
error = test<T, U>::mrun(device_, context_, queue_, num_elements_, error = test<T, U>::mrun(device_, context_, queue_, num_elements_,
kernel_name, source, test_params_); kernel_name.c_str(), source.c_str(),
test_params_);
} }
else else
{ {
error = test<T, U>::run(device_, context_, queue_, num_elements_, error = test<T, U>::run(device_, context_, queue_, num_elements_,
kernel_name, source, test_params_); kernel_name.c_str(), source.c_str(),
test_params_);
} }
return error; return error;

View File

@@ -150,25 +150,25 @@ template <typename T>
int run_broadcast_scan_reduction_for_type(RunTestForType rft) int run_broadcast_scan_reduction_for_type(RunTestForType rft)
{ {
int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>( int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
"test_bcast", bcast_source); "sub_group_broadcast");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("test_redadd", error |=
redadd_source); rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("sub_group_reduce_add");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("test_redmax", error |=
redmax_source); rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("sub_group_reduce_max");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("test_redmin", error |=
redmin_source); rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("sub_group_reduce_min");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>("test_scinadd", error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
scinadd_source); "sub_group_scan_inclusive_add");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>("test_scinmax", error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
scinmax_source); "sub_group_scan_inclusive_max");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>("test_scinmin", error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
scinmin_source); "sub_group_scan_inclusive_min");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>("test_scexadd", error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
scexadd_source); "sub_group_scan_exclusive_add");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>("test_scexmax", error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
scexmax_source); "sub_group_scan_exclusive_max");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>("test_scexmin", error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
scexmin_source); "sub_group_scan_exclusive_min");
return error; return error;
} }
@@ -181,11 +181,14 @@ int test_subgroup_functions(cl_device_id device, cl_context context,
constexpr size_t global_work_size = 2000; constexpr size_t global_work_size = 2000;
constexpr size_t local_work_size = 200; constexpr size_t local_work_size = 200;
WorkGroupParams test_params(global_work_size, local_work_size); WorkGroupParams test_params(global_work_size, local_work_size);
test_params.save_kernel_source(sub_group_reduction_scan_source);
test_params.save_kernel_source(sub_group_generic_source,
"sub_group_broadcast");
RunTestForType rft(device, context, queue, num_elements, test_params); RunTestForType rft(device, context, queue, num_elements, test_params);
int error = int error =
rft.run_impl<cl_int, AA<NonUniformVoteOp::any>>("test_any", any_source); rft.run_impl<cl_int, AA<NonUniformVoteOp::any>>("sub_group_any");
error |= error |= rft.run_impl<cl_int, AA<NonUniformVoteOp::all>>("sub_group_all");
rft.run_impl<cl_int, AA<NonUniformVoteOp::all>>("test_all", all_source);
error |= run_broadcast_scan_reduction_for_type<cl_int>(rft); error |= run_broadcast_scan_reduction_for_type<cl_int>(rft);
error |= run_broadcast_scan_reduction_for_type<cl_uint>(rft); error |= run_broadcast_scan_reduction_for_type<cl_uint>(rft);
error |= run_broadcast_scan_reduction_for_type<cl_long>(rft); error |= run_broadcast_scan_reduction_for_type<cl_long>(rft);

View File

@@ -684,239 +684,127 @@ template <typename Ty, BallotOp operation> struct SMASK
} }
}; };
static const char *bcast_non_uniform_source = std::string sub_group_non_uniform_broadcast_source = R"(
"__kernel void test_bcast_non_uniform(const __global Type *in, __global " __kernel void test_sub_group_non_uniform_broadcast(const __global Type *in, __global int4 *xy, __global Type *out) {
"int4 *xy, __global Type *out)\n" int gid = get_global_id(0);
"{\n" XY(xy,gid);
" int gid = get_global_id(0);\n" Type x = in[gid];
" XY(xy,gid);\n" if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {
" Type x = in[gid];\n" out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].z);
" if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n" } else {
" out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].z);\n" out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].w);
" } else {\n" }
" out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].w);\n" }
" }\n" )";
"}\n"; std::string sub_group_broadcast_first_source = R"(
__kernel void test_sub_group_broadcast_first(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
Type x = in[gid];
if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {
out[gid] = sub_group_broadcast_first(x);;
} else {
out[gid] = sub_group_broadcast_first(x);;
}
}
)";
// Kernel source template for built-ins that are called with the input value
// and whose result is stored in the first component of a uint4 (the other
// three components are written as zero). The kernel is named "test_" plus
// whatever replaces the first "%s"; the second "%s" is the function that is
// called.
// NOTE(review): the "%s" placeholders are presumably filled in with the
// subgroup function name by the regex_replace in RunTestForType::run_impl --
// confirm at the site where this template is registered.
std::string sub_group_ballot_bit_scan_find_source = R"(
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
Type x = in[gid];
uint4 value = (uint4)(0,0,0,0);
value = (uint4)(%s(x),0,0,0);
out[gid] = value;
}
)";
// Kernel source template for built-ins that take no arguments and return a
// uint4 mask, which is written to out[gid] unchanged. The kernel also
// overwrites xy[gid].z with get_max_sub_group_size() after the XY macro has
// run. The input value is loaded into x but not used by the call.
// NOTE(review): the "%s" placeholders are presumably filled in with the
// subgroup function name by the regex_replace in RunTestForType::run_impl --
// confirm at the site where this template is registered.
std::string sub_group_ballot_mask_source = R"(
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
xy[gid].z = get_max_sub_group_size();
Type x = in[gid];
uint4 mask = %s();
out[gid] = mask;
}
)";
std::string sub_group_ballot_source = R"(
__kernel void test_sub_group_ballot(const __global Type *in, __global int4 *xy, __global Type *out) {
uint4 full_ballot = sub_group_ballot(1);
uint divergence_mask;
uint4 partial_ballot;
uint gid = get_global_id(0);
XY(xy,gid);
if (get_sub_group_local_id() & 1) {
divergence_mask = 0xaaaaaaaa;
partial_ballot = sub_group_ballot(1);
} else {
divergence_mask = 0x55555555;
partial_ballot = sub_group_ballot(1);
}
size_t lws = get_local_size(0);
uint4 masked_ballot = full_ballot;
masked_ballot.x &= divergence_mask;
masked_ballot.y &= divergence_mask;
masked_ballot.z &= divergence_mask;
masked_ballot.w &= divergence_mask;
out[gid] = all(masked_ballot == partial_ballot);
static const char *bcast_first_source = }
"__kernel void test_bcast_first(const __global Type *in, __global int4 " )";
"*xy, __global Type *out)\n" std::string sub_group_inverse_ballot_source = R"(
"{\n" __kernel void test_sub_group_inverse_ballot(const __global Type *in, __global int4 *xy, __global Type *out) {
" int gid = get_global_id(0);\n" int gid = get_global_id(0);
" XY(xy,gid);\n" XY(xy,gid);
" Type x = in[gid];\n" Type x = in[gid];
" if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n" uint4 value = (uint4)(10,0,0,0);
" out[gid] = sub_group_broadcast_first(x);\n" if (get_sub_group_local_id() & 1) {
" } else {\n" uint4 partial_ballot_mask = (uint4)(0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA);
" out[gid] = sub_group_broadcast_first(x);\n" if (sub_group_inverse_ballot(partial_ballot_mask)) {
" }\n" value = (uint4)(1,0,0,1);
"}\n"; } else {
value = (uint4)(0,0,0,1);
static const char *ballot_bit_count_source = }
"__kernel void test_sub_group_ballot_bit_count(const __global Type *in, " } else {
"__global int4 *xy, __global Type *out)\n" uint4 partial_ballot_mask = (uint4)(0x55555555,0x55555555,0x55555555,0x55555555);
"{\n" if (sub_group_inverse_ballot(partial_ballot_mask)) {
" int gid = get_global_id(0);\n" value = (uint4)(1,0,0,2);
" XY(xy,gid);\n" } else {
" Type x = in[gid];\n" value = (uint4)(0,0,0,2);
" uint4 value = (uint4)(0,0,0,0);\n" }
" value = (uint4)(sub_group_ballot_bit_count(x),0,0,0);\n" }
" out[gid] = value;\n" out[gid] = value;
"}\n"; }
)";
static const char *ballot_inclusive_scan_source = std::string sub_group_ballot_bit_extract_source = R"(
"__kernel void test_sub_group_ballot_inclusive_scan(const __global Type " __kernel void test_sub_group_ballot_bit_extract(const __global Type *in, __global int4 *xy, __global Type *out) {
"*in, __global int4 *xy, __global Type *out)\n" int gid = get_global_id(0);
"{\n" XY(xy,gid);
" int gid = get_global_id(0);\n" Type x = in[gid];
" XY(xy,gid);\n" uint index = xy[gid].z;
" Type x = in[gid];\n" uint4 value = (uint4)(10,0,0,0);
" uint4 value = (uint4)(0,0,0,0);\n" if (get_sub_group_local_id() & 1) {
" value = (uint4)(sub_group_ballot_inclusive_scan(x),0,0,0);\n" if (sub_group_ballot_bit_extract(x, xy[gid].z)) {
" out[gid] = value;\n" value = (uint4)(1,0,0,1);
"}\n"; } else {
value = (uint4)(0,0,0,1);
static const char *ballot_exclusive_scan_source = }
"__kernel void test_sub_group_ballot_exclusive_scan(const __global Type " } else {
"*in, __global int4 *xy, __global Type *out)\n" if (sub_group_ballot_bit_extract(x, xy[gid].w)) {
"{\n" value = (uint4)(1,0,0,2);
" int gid = get_global_id(0);\n" } else {
" XY(xy,gid);\n" value = (uint4)(0,0,0,2);
" Type x = in[gid];\n" }
" uint4 value = (uint4)(0,0,0,0);\n" }
" value = (uint4)(sub_group_ballot_exclusive_scan(x),0,0,0);\n" out[gid] = value;
" out[gid] = value;\n" }
"}\n"; )";
static const char *ballot_find_lsb_source =
"__kernel void test_sub_group_ballot_find_lsb(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" uint4 value = (uint4)(0,0,0,0);\n"
" value = (uint4)(sub_group_ballot_find_lsb(x),0,0,0);\n"
" out[gid] = value;\n"
"}\n";
static const char *ballot_find_msb_source =
"__kernel void test_sub_group_ballot_find_msb(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" uint4 value = (uint4)(0,0,0,0);"
" value = (uint4)(sub_group_ballot_find_msb(x),0,0,0);"
" out[gid] = value ;"
"}\n";
static const char *get_subgroup_ge_mask_source =
"__kernel void test_get_sub_group_ge_mask(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].z = get_max_sub_group_size();\n"
" Type x = in[gid];\n"
" uint4 mask = get_sub_group_ge_mask();"
" out[gid] = mask;\n"
"}\n";
static const char *get_subgroup_gt_mask_source =
"__kernel void test_get_sub_group_gt_mask(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].z = get_max_sub_group_size();\n"
" Type x = in[gid];\n"
" uint4 mask = get_sub_group_gt_mask();"
" out[gid] = mask;\n"
"}\n";
static const char *get_subgroup_le_mask_source =
"__kernel void test_get_sub_group_le_mask(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].z = get_max_sub_group_size();\n"
" Type x = in[gid];\n"
" uint4 mask = get_sub_group_le_mask();"
" out[gid] = mask;\n"
"}\n";
static const char *get_subgroup_lt_mask_source =
"__kernel void test_get_sub_group_lt_mask(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].z = get_max_sub_group_size();\n"
" Type x = in[gid];\n"
" uint4 mask = get_sub_group_lt_mask();"
" out[gid] = mask;\n"
"}\n";
static const char *get_subgroup_eq_mask_source =
"__kernel void test_get_sub_group_eq_mask(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].z = get_max_sub_group_size();\n"
" Type x = in[gid];\n"
" uint4 mask = get_sub_group_eq_mask();"
" out[gid] = mask;\n"
"}\n";
static const char *ballot_source =
"__kernel void test_sub_group_ballot(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
"uint4 full_ballot = sub_group_ballot(1);\n"
"uint divergence_mask;\n"
"uint4 partial_ballot;\n"
"uint gid = get_global_id(0);"
"XY(xy,gid);\n"
"if (get_sub_group_local_id() & 1) {\n"
" divergence_mask = 0xaaaaaaaa;\n"
" partial_ballot = sub_group_ballot(1);\n"
"} else {\n"
" divergence_mask = 0x55555555;\n"
" partial_ballot = sub_group_ballot(1);\n"
"}\n"
" size_t lws = get_local_size(0);\n"
"uint4 masked_ballot = full_ballot;\n"
"masked_ballot.x &= divergence_mask;\n"
"masked_ballot.y &= divergence_mask;\n"
"masked_ballot.z &= divergence_mask;\n"
"masked_ballot.w &= divergence_mask;\n"
"out[gid] = all(masked_ballot == partial_ballot);\n"
"} \n";
static const char *ballot_source_inverse =
"__kernel void test_sub_group_ballot_inverse(const __global "
"Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" uint4 value = (uint4)(10,0,0,0);\n"
" if (get_sub_group_local_id() & 1) {"
" uint4 partial_ballot_mask = "
"(uint4)(0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA);"
" if (sub_group_inverse_ballot(partial_ballot_mask)) {\n"
" value = (uint4)(1,0,0,1);\n"
" } else {\n"
" value = (uint4)(0,0,0,1);\n"
" }\n"
" } else {\n"
" uint4 partial_ballot_mask = "
"(uint4)(0x55555555,0x55555555,0x55555555,0x55555555);"
" if (sub_group_inverse_ballot(partial_ballot_mask)) {\n"
" value = (uint4)(1,0,0,2);\n"
" } else {\n"
" value = (uint4)(0,0,0,2);\n"
" }\n"
" }\n"
" out[gid] = value;\n"
"}\n";
static const char *ballot_bit_extract_source =
"__kernel void test_sub_group_ballot_bit_extract(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" uint index = xy[gid].z;\n"
" uint4 value = (uint4)(10,0,0,0);\n"
" if (get_sub_group_local_id() & 1) {"
" if (sub_group_ballot_bit_extract(x, xy[gid].z)) {\n"
" value = (uint4)(1,0,0,1);\n"
" } else {\n"
" value = (uint4)(0,0,0,1);\n"
" }\n"
" } else {\n"
" if (sub_group_ballot_bit_extract(x, xy[gid].w)) {\n"
" value = (uint4)(1,0,0,2);\n"
" } else {\n"
" value = (uint4)(0,0,0,2);\n"
" }\n"
" }\n"
" out[gid] = value;\n"
"}\n";
template <typename T> int run_non_uniform_broadcast_for_type(RunTestForType rft) template <typename T> int run_non_uniform_broadcast_for_type(RunTestForType rft)
{ {
int error = int error =
rft.run_impl<T, BC<T, SubgroupsBroadcastOp::non_uniform_broadcast>>( rft.run_impl<T, BC<T, SubgroupsBroadcastOp::non_uniform_broadcast>>(
"test_bcast_non_uniform", bcast_non_uniform_source); "sub_group_non_uniform_broadcast");
return error; return error;
} }
@@ -932,9 +820,15 @@ int test_subgroup_functions_ballot(cl_device_id device, cl_context context,
"skipping test.\n"); "skipping test.\n");
return TEST_SKIPPED_ITSELF; return TEST_SKIPPED_ITSELF;
} }
constexpr size_t global_work_size = 170; constexpr size_t global_work_size = 170;
constexpr size_t local_work_size = 64; constexpr size_t local_work_size = 64;
WorkGroupParams test_params(global_work_size, local_work_size); WorkGroupParams test_params(global_work_size, local_work_size);
test_params.save_kernel_source(sub_group_ballot_mask_source);
test_params.save_kernel_source(sub_group_non_uniform_broadcast_source,
"sub_group_non_uniform_broadcast");
test_params.save_kernel_source(sub_group_broadcast_first_source,
"sub_group_broadcast_first");
RunTestForType rft(device, context, queue, num_elements, test_params); RunTestForType rft(device, context, queue, num_elements, test_params);
// non uniform broadcast functions // non uniform broadcast functions
@@ -1018,76 +912,87 @@ int test_subgroup_functions_ballot(cl_device_id device, cl_context context,
// broadcast first functions // broadcast first functions
error |= error |=
rft.run_impl<cl_int, BC<cl_int, SubgroupsBroadcastOp::broadcast_first>>( rft.run_impl<cl_int, BC<cl_int, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source); "sub_group_broadcast_first");
error |= rft.run_impl<cl_uint, error |= rft.run_impl<cl_uint,
BC<cl_uint, SubgroupsBroadcastOp::broadcast_first>>( BC<cl_uint, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source); "sub_group_broadcast_first");
error |= rft.run_impl<cl_long, error |= rft.run_impl<cl_long,
BC<cl_long, SubgroupsBroadcastOp::broadcast_first>>( BC<cl_long, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source); "sub_group_broadcast_first");
error |= rft.run_impl<cl_ulong, error |= rft.run_impl<cl_ulong,
BC<cl_ulong, SubgroupsBroadcastOp::broadcast_first>>( BC<cl_ulong, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source); "sub_group_broadcast_first");
error |= rft.run_impl<cl_short, error |= rft.run_impl<cl_short,
BC<cl_short, SubgroupsBroadcastOp::broadcast_first>>( BC<cl_short, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source); "sub_group_broadcast_first");
error |= rft.run_impl<cl_ushort, error |= rft.run_impl<cl_ushort,
BC<cl_ushort, SubgroupsBroadcastOp::broadcast_first>>( BC<cl_ushort, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source); "sub_group_broadcast_first");
error |= rft.run_impl<cl_char, error |= rft.run_impl<cl_char,
BC<cl_char, SubgroupsBroadcastOp::broadcast_first>>( BC<cl_char, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source); "sub_group_broadcast_first");
error |= rft.run_impl<cl_uchar, error |= rft.run_impl<cl_uchar,
BC<cl_uchar, SubgroupsBroadcastOp::broadcast_first>>( BC<cl_uchar, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source); "sub_group_broadcast_first");
error |= rft.run_impl<cl_float, error |= rft.run_impl<cl_float,
BC<cl_float, SubgroupsBroadcastOp::broadcast_first>>( BC<cl_float, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source); "sub_group_broadcast_first");
error |= rft.run_impl<cl_double, error |= rft.run_impl<cl_double,
BC<cl_double, SubgroupsBroadcastOp::broadcast_first>>( BC<cl_double, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source); "sub_group_broadcast_first");
error |= rft.run_impl< error |= rft.run_impl<
subgroups::cl_half, subgroups::cl_half,
BC<subgroups::cl_half, SubgroupsBroadcastOp::broadcast_first>>( BC<subgroups::cl_half, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source); "sub_group_broadcast_first");
// mask functions // mask functions
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::eq_mask>>( error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::eq_mask>>(
"test_get_sub_group_eq_mask", get_subgroup_eq_mask_source); "get_sub_group_eq_mask");
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::ge_mask>>( error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::ge_mask>>(
"test_get_sub_group_ge_mask", get_subgroup_ge_mask_source); "get_sub_group_ge_mask");
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::gt_mask>>( error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::gt_mask>>(
"test_get_sub_group_gt_mask", get_subgroup_gt_mask_source); "get_sub_group_gt_mask");
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::le_mask>>( error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::le_mask>>(
"test_get_sub_group_le_mask", get_subgroup_le_mask_source); "get_sub_group_le_mask");
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::lt_mask>>( error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::lt_mask>>(
"test_get_sub_group_lt_mask", get_subgroup_lt_mask_source); "get_sub_group_lt_mask");
// ballot functions // ballot functions
error |= rft.run_impl<cl_uint, BALLOT<cl_uint>>("test_sub_group_ballot", WorkGroupParams test_params_ballot(global_work_size, local_work_size);
ballot_source); test_params_ballot.save_kernel_source(
error |= rft.run_impl<cl_uint4, sub_group_ballot_bit_scan_find_source);
BALLOT_INVERSE<cl_uint4, BallotOp::inverse_ballot>>( test_params_ballot.save_kernel_source(sub_group_ballot_source,
"test_sub_group_ballot_inverse", ballot_source_inverse); "sub_group_ballot");
error |= rft.run_impl< test_params_ballot.save_kernel_source(sub_group_inverse_ballot_source,
"sub_group_inverse_ballot");
test_params_ballot.save_kernel_source(sub_group_ballot_bit_extract_source,
"sub_group_ballot_bit_extract");
RunTestForType rft_ballot(device, context, queue, num_elements,
test_params_ballot);
error |= rft_ballot.run_impl<cl_uint, BALLOT<cl_uint>>("sub_group_ballot");
error |=
rft_ballot.run_impl<cl_uint4,
BALLOT_INVERSE<cl_uint4, BallotOp::inverse_ballot>>(
"sub_group_inverse_ballot");
error |= rft_ballot.run_impl<
cl_uint4, BALLOT_BIT_EXTRACT<cl_uint4, BallotOp::ballot_bit_extract>>( cl_uint4, BALLOT_BIT_EXTRACT<cl_uint4, BallotOp::ballot_bit_extract>>(
"test_sub_group_ballot_bit_extract", ballot_bit_extract_source); "sub_group_ballot_bit_extract");
error |= rft.run_impl< error |= rft_ballot.run_impl<
cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_bit_count>>( cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_bit_count>>(
"test_sub_group_ballot_bit_count", ballot_bit_count_source); "sub_group_ballot_bit_count");
error |= rft.run_impl< error |= rft_ballot.run_impl<
cl_uint4, cl_uint4,
BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_inclusive_scan>>( BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_inclusive_scan>>(
"test_sub_group_ballot_inclusive_scan", ballot_inclusive_scan_source); "sub_group_ballot_inclusive_scan");
error |= rft.run_impl< error |= rft_ballot.run_impl<
cl_uint4, cl_uint4,
BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_exclusive_scan>>( BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_exclusive_scan>>(
"test_sub_group_ballot_exclusive_scan", ballot_exclusive_scan_source); "sub_group_ballot_exclusive_scan");
error |= rft.run_impl< error |= rft_ballot.run_impl<
cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_lsb>>( cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_lsb>>(
"test_sub_group_ballot_find_lsb", ballot_find_lsb_source); "sub_group_ballot_find_lsb");
error |= rft.run_impl< error |= rft_ballot.run_impl<
cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_msb>>( cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_msb>>(
"test_sub_group_ballot_find_msb", ballot_find_msb_source); "sub_group_ballot_find_msb");
return error; return error;
} }

View File

@@ -22,149 +22,17 @@
#define CLUSTER_SIZE_STR "4" #define CLUSTER_SIZE_STR "4"
namespace { namespace {
static const char *redadd_clustered_source = std::string sub_group_clustered_reduce_source = R"(
"__kernel void test_redadd_clustered(const __global Type *in, __global " __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
"int4 *xy, __global Type *out)\n" int gid = get_global_id(0);
"{\n" XY(xy,gid);
" int gid = get_global_id(0);\n" xy[gid].w = 0;
" XY(xy,gid);\n" if (sizeof(in[gid]) == sizeof(%s(in[gid], )" CLUSTER_SIZE_STR R"())) {
" xy[gid].w = 0;\n" xy[gid].w = sizeof(in[gid]);
" if (sizeof(in[gid]) == " }
"sizeof(sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR ")))\n" out[gid] = %s(in[gid], )" CLUSTER_SIZE_STR R"();
" {xy[gid].w = sizeof(in[gid]);}\n" }
" out[gid] = sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR )";
");\n"
"}\n";
static const char *redmax_clustered_source =
"__kernel void test_redmax_clustered(const __global Type *in, __global "
"int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].w = 0;\n"
" if (sizeof(in[gid]) == "
"sizeof(sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR ")))\n"
" {xy[gid].w = sizeof(in[gid]);}\n"
" out[gid] = sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR
");\n"
"}\n";
static const char *redmin_clustered_source =
"__kernel void test_redmin_clustered(const __global Type *in, __global "
"int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].w = 0;\n"
" if (sizeof(in[gid]) == "
"sizeof(sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR ")))\n"
" {xy[gid].w = sizeof(in[gid]);}\n"
" out[gid] = sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR
");\n"
"}\n";
static const char *redmul_clustered_source =
"__kernel void test_redmul_clustered(const __global Type *in, __global "
"int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].w = 0;\n"
" if (sizeof(in[gid]) == "
"sizeof(sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR ")))\n"
" {xy[gid].w = sizeof(in[gid]);}\n"
" out[gid] = sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR
");\n"
"}\n";
static const char *redand_clustered_source =
"__kernel void test_redand_clustered(const __global Type *in, __global "
"int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].w = 0;\n"
" if (sizeof(in[gid]) == "
"sizeof(sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR ")))\n"
" {xy[gid].w = sizeof(in[gid]);}\n"
" out[gid] = sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR
");\n"
"}\n";
static const char *redor_clustered_source =
"__kernel void test_redor_clustered(const __global Type *in, __global int4 "
"*xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].w = 0;\n"
" if (sizeof(in[gid]) == "
"sizeof(sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR ")))\n"
" {xy[gid].w = sizeof(in[gid]);}\n"
" out[gid] = sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR
");\n"
"}\n";
static const char *redxor_clustered_source =
"__kernel void test_redxor_clustered(const __global Type *in, __global "
"int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].w = 0;\n"
" if (sizeof(in[gid]) == "
"sizeof(sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR ")))\n"
" {xy[gid].w = sizeof(in[gid]);}\n"
" out[gid] = sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR
");\n"
"}\n";
static const char *redand_clustered_logical_source =
"__kernel void test_redand_clustered_logical(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].w = 0;\n"
" if (sizeof(in[gid]) == "
"sizeof(sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR
")))\n"
" {xy[gid].w = sizeof(in[gid]);}\n"
" out[gid] = "
"sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR ");\n"
"}\n";
static const char *redor_clustered_logical_source =
"__kernel void test_redor_clustered_logical(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].w = 0;\n"
" if (sizeof(in[gid]) == "
"sizeof(sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR
")))\n"
" {xy[gid].w = sizeof(in[gid]);}\n"
" out[gid] = "
"sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR ");\n"
"}\n";
static const char *redxor_clustered_logical_source =
"__kernel void test_redxor_clustered_logical(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].w = 0;\n"
" if ( sizeof(in[gid]) == "
"sizeof(sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR
")))\n"
" {xy[gid].w = sizeof(in[gid]);}\n"
" out[gid] = "
"sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR ");\n"
"}\n";
// DESCRIPTION: // DESCRIPTION:
// Test for reduce cluster functions // Test for reduce cluster functions
@@ -267,34 +135,34 @@ template <typename T>
int run_cluster_red_add_max_min_mul_for_type(RunTestForType rft) int run_cluster_red_add_max_min_mul_for_type(RunTestForType rft)
{ {
int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::add_>>( int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::add_>>(
"test_redadd_clustered", redadd_clustered_source); "sub_group_clustered_reduce_add");
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::max_>>( error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::max_>>(
"test_redmax_clustered", redmax_clustered_source); "sub_group_clustered_reduce_max");
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::min_>>( error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::min_>>(
"test_redmin_clustered", redmin_clustered_source); "sub_group_clustered_reduce_min");
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::mul_>>( error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::mul_>>(
"test_redmul_clustered", redmul_clustered_source); "sub_group_clustered_reduce_mul");
return error; return error;
} }
template <typename T> int run_cluster_and_or_xor_for_type(RunTestForType rft) template <typename T> int run_cluster_and_or_xor_for_type(RunTestForType rft)
{ {
int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::and_>>( int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::and_>>(
"test_redand_clustered", redand_clustered_source); "sub_group_clustered_reduce_and");
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::or_>>( error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::or_>>(
"test_redor_clustered", redor_clustered_source); "sub_group_clustered_reduce_or");
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::xor_>>( error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::xor_>>(
"test_redxor_clustered", redxor_clustered_source); "sub_group_clustered_reduce_xor");
return error; return error;
} }
template <typename T> template <typename T>
int run_cluster_logical_and_or_xor_for_type(RunTestForType rft) int run_cluster_logical_and_or_xor_for_type(RunTestForType rft)
{ {
int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_and>>( int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_and>>(
"test_redand_clustered_logical", redand_clustered_logical_source); "sub_group_clustered_reduce_logical_and");
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_or>>( error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_or>>(
"test_redor_clustered_logical", redor_clustered_logical_source); "sub_group_clustered_reduce_logical_or");
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_xor>>( error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_xor>>(
"test_redxor_clustered_logical", redxor_clustered_logical_source); "sub_group_clustered_reduce_logical_xor");
return error; return error;
} }
@@ -311,9 +179,11 @@ int test_subgroup_functions_clustered_reduce(cl_device_id device,
"device, skipping test.\n"); "device, skipping test.\n");
return TEST_SKIPPED_ITSELF; return TEST_SKIPPED_ITSELF;
} }
constexpr size_t global_work_size = 2000; constexpr size_t global_work_size = 2000;
constexpr size_t local_work_size = 200; constexpr size_t local_work_size = 200;
WorkGroupParams test_params(global_work_size, local_work_size); WorkGroupParams test_params(global_work_size, local_work_size);
test_params.save_kernel_source(sub_group_clustered_reduce_source);
RunTestForType rft(device, context, queue, num_elements, test_params); RunTestForType rft(device, context, queue, num_elements, test_params);
int error = run_cluster_red_add_max_min_mul_for_type<cl_int>(rft); int error = run_cluster_red_add_max_min_mul_for_type<cl_int>(rft);

View File

@@ -24,30 +24,30 @@ namespace {
template <typename T> int run_broadcast_for_extended_type(RunTestForType rft) template <typename T> int run_broadcast_for_extended_type(RunTestForType rft)
{ {
int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>( int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
"test_bcast", bcast_source); "sub_group_broadcast");
return error; return error;
} }
template <typename T> int run_scan_reduction_for_type(RunTestForType rft) template <typename T> int run_scan_reduction_for_type(RunTestForType rft)
{ {
int error = rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("test_redadd", int error =
redadd_source); rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("sub_group_reduce_add");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("test_redmax", error |=
redmax_source); rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("sub_group_reduce_max");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("test_redmin", error |=
redmin_source); rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("sub_group_reduce_min");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>("test_scinadd", error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
scinadd_source); "sub_group_scan_inclusive_add");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>("test_scinmax", error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
scinmax_source); "sub_group_scan_inclusive_max");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>("test_scinmin", error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
scinmin_source); "sub_group_scan_inclusive_min");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>("test_scexadd", error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
scexadd_source); "sub_group_scan_exclusive_add");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>("test_scexmax", error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
scexmax_source); "sub_group_scan_exclusive_max");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>("test_scexmin", error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
scexmin_source); "sub_group_scan_exclusive_min");
return error; return error;
} }
@@ -65,11 +65,15 @@ int test_subgroup_functions_extended_types(cl_device_id device,
"device, skipping test.\n"); "device, skipping test.\n");
return TEST_SKIPPED_ITSELF; return TEST_SKIPPED_ITSELF;
} }
constexpr size_t global_work_size = 2000; constexpr size_t global_work_size = 2000;
constexpr size_t local_work_size = 200; constexpr size_t local_work_size = 200;
WorkGroupParams test_params(global_work_size, local_work_size); WorkGroupParams test_params(global_work_size, local_work_size);
RunTestForType rft(device, context, queue, num_elements, test_params); test_params.save_kernel_source(sub_group_reduction_scan_source);
test_params.save_kernel_source(sub_group_generic_source,
"sub_group_broadcast");
RunTestForType rft(device, context, queue, num_elements, test_params);
int error = run_broadcast_for_extended_type<cl_uint2>(rft); int error = run_broadcast_for_extended_type<cl_uint2>(rft);
error |= run_broadcast_for_extended_type<subgroups::cl_uint3>(rft); error |= run_broadcast_for_extended_type<subgroups::cl_uint3>(rft);
error |= run_broadcast_for_extended_type<cl_uint4>(rft); error |= run_broadcast_for_extended_type<cl_uint4>(rft);

View File

@@ -17,336 +17,29 @@
#include "subhelpers.h" #include "subhelpers.h"
#include "harness/typeWrappers.h" #include "harness/typeWrappers.h"
#include "subgroup_common_templates.h" #include "subgroup_common_templates.h"
#include <cstdio>
namespace { namespace {
static const char *scinadd_non_uniform_source = R"( std::string sub_group_non_uniform_arithmetic_source = R"(
__kernel void test_scinadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0); int gid = get_global_id(0);
XY(xy,gid); XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32); uint subgroup_local_id = get_sub_group_local_id();
if (elect_work_item & WORK_ITEMS_MASK){ uint elect_work_item = 1 << (subgroup_local_id % 32);
out[gid] = sub_group_non_uniform_scan_inclusive_add(in[gid]); uint work_item_mask;
} if(subgroup_local_id < 32) {
} work_item_mask = work_item_mask_vector.x;
)"; } else if(subgroup_local_id < 64) {
work_item_mask = work_item_mask_vector.y;
static const char *scinmax_non_uniform_source = R"( } else if(subgroup_local_id < 96) {
__kernel void test_scinmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) { work_item_mask = work_item_mask_vector.w;
int gid = get_global_id(0); } else if(subgroup_local_id < 128) {
XY(xy,gid); work_item_mask = work_item_mask_vector.z;
int elect_work_item = 1 << (get_sub_group_local_id() % 32); }
if (elect_work_item & WORK_ITEMS_MASK){ if (elect_work_item & work_item_mask){
out[gid] = sub_group_non_uniform_scan_inclusive_max(in[gid]); out[gid] = %s(in[gid]);
} }
}
)";
static const char *scinmin_non_uniform_source = R"(
__kernel void test_scinmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_min(in[gid]);
}
}
)";
static const char *scinmul_non_uniform_source = R"(
__kernel void test_scinmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_mul(in[gid]);
}
}
)";
static const char *scinand_non_uniform_source = R"(
__kernel void test_scinand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_and(in[gid]);
}
}
)";
static const char *scinor_non_uniform_source = R"(
__kernel void test_scinor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_or(in[gid]);
}
}
)";
static const char *scinxor_non_uniform_source = R"(
__kernel void test_scinxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_xor(in[gid]);
}
}
)";
static const char *scinand_non_uniform_logical_source = R"(
__kernel void test_scinand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_logical_and(in[gid]);
}
}
)";
static const char *scinor_non_uniform_logical_source = R"(
__kernel void test_scinor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_logical_or(in[gid]);
}
}
)";
static const char *scinxor_non_uniform_logical_source = R"(
__kernel void test_scinxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_logical_xor(in[gid]);
}
}
)";
static const char *scexadd_non_uniform_source = R"(
__kernel void test_scexadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_add(in[gid]);
}
}
)";
static const char *scexmax_non_uniform_source = R"(
__kernel void test_scexmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_max(in[gid]);
}
}
)";
static const char *scexmin_non_uniform_source = R"(
__kernel void test_scexmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_min(in[gid]);
}
}
)";
static const char *scexmul_non_uniform_source = R"(
__kernel void test_scexmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_mul(in[gid]);
}
}
)";
static const char *scexand_non_uniform_source = R"(
__kernel void test_scexand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_and(in[gid]);
}
}
)";
static const char *scexor_non_uniform_source = R"(
__kernel void test_scexor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_or(in[gid]);
}
}
)";
static const char *scexxor_non_uniform_source = R"(
__kernel void test_scexxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_xor(in[gid]);
}
}
)";
static const char *scexand_non_uniform_logical_source = R"(
__kernel void test_scexand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_logical_and(in[gid]);
}
}
)";
// OpenCL C source for kernel test_scexor_non_uniform_logical.
// A work-item calls sub_group_non_uniform_scan_exclusive_logical_or(in[gid])
// and stores the result in out[gid] only when its bit (sub-group local id
// modulo 32) is set in WORK_ITEMS_MASK; the visible body has no store to
// out[gid] for the remaining work-items.
// Type, XY and WORK_ITEMS_MASK are not defined in this string - presumably
// they are supplied when the program is built; confirm against the harness.
static const char *scexor_non_uniform_logical_source = R"(
__kernel void test_scexor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_logical_or(in[gid]);
}
}
)";
// OpenCL C source for kernel test_scexxor_non_uniform_logical.
// A work-item calls sub_group_non_uniform_scan_exclusive_logical_xor(in[gid])
// and stores the result in out[gid] only when its bit (sub-group local id
// modulo 32) is set in WORK_ITEMS_MASK; the visible body has no store to
// out[gid] for the remaining work-items.
// Type, XY and WORK_ITEMS_MASK are not defined in this string - presumably
// they are supplied when the program is built; confirm against the harness.
static const char *scexxor_non_uniform_logical_source = R"(
__kernel void test_scexxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_logical_xor(in[gid]);
}
}
)";
// OpenCL C source for kernel test_redadd_non_uniform.
// A work-item calls sub_group_non_uniform_reduce_add(in[gid]) and stores the
// result in out[gid] only when its bit (sub-group local id modulo 32) is set
// in WORK_ITEMS_MASK; the visible body has no store to out[gid] for the
// remaining work-items.
// Type, XY and WORK_ITEMS_MASK are not defined in this string - presumably
// they are supplied when the program is built; confirm against the harness.
static const char *redadd_non_uniform_source = R"(
__kernel void test_redadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_add(in[gid]);
}
}
)";
// OpenCL C source for kernel test_redmax_non_uniform.
// A work-item calls sub_group_non_uniform_reduce_max(in[gid]) and stores the
// result in out[gid] only when its bit (sub-group local id modulo 32) is set
// in WORK_ITEMS_MASK; the visible body has no store to out[gid] for the
// remaining work-items.
// Type, XY and WORK_ITEMS_MASK are not defined in this string - presumably
// they are supplied when the program is built; confirm against the harness.
static const char *redmax_non_uniform_source = R"(
__kernel void test_redmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_max(in[gid]);
}
}
)";
// OpenCL C source for kernel test_redmin_non_uniform.
// A work-item calls sub_group_non_uniform_reduce_min(in[gid]) and stores the
// result in out[gid] only when its bit (sub-group local id modulo 32) is set
// in WORK_ITEMS_MASK; the visible body has no store to out[gid] for the
// remaining work-items.
// Type, XY and WORK_ITEMS_MASK are not defined in this string - presumably
// they are supplied when the program is built; confirm against the harness.
static const char *redmin_non_uniform_source = R"(
__kernel void test_redmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_min(in[gid]);
}
}
)";
// OpenCL C source for kernel test_redmul_non_uniform.
// A work-item calls sub_group_non_uniform_reduce_mul(in[gid]) and stores the
// result in out[gid] only when its bit (sub-group local id modulo 32) is set
// in WORK_ITEMS_MASK; the visible body has no store to out[gid] for the
// remaining work-items.
// Type, XY and WORK_ITEMS_MASK are not defined in this string - presumably
// they are supplied when the program is built; confirm against the harness.
static const char *redmul_non_uniform_source = R"(
__kernel void test_redmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_mul(in[gid]);
}
}
)";
// OpenCL C source for kernel test_redand_non_uniform.
// A work-item calls sub_group_non_uniform_reduce_and(in[gid]) and stores the
// result in out[gid] only when its bit (sub-group local id modulo 32) is set
// in WORK_ITEMS_MASK; the visible body has no store to out[gid] for the
// remaining work-items.
// Type, XY and WORK_ITEMS_MASK are not defined in this string - presumably
// they are supplied when the program is built; confirm against the harness.
static const char *redand_non_uniform_source = R"(
__kernel void test_redand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_and(in[gid]);
}
}
)";
// OpenCL C source for kernel test_redor_non_uniform.
// A work-item calls sub_group_non_uniform_reduce_or(in[gid]) and stores the
// result in out[gid] only when its bit (sub-group local id modulo 32) is set
// in WORK_ITEMS_MASK; the visible body has no store to out[gid] for the
// remaining work-items.
// Type, XY and WORK_ITEMS_MASK are not defined in this string - presumably
// they are supplied when the program is built; confirm against the harness.
static const char *redor_non_uniform_source = R"(
__kernel void test_redor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_or(in[gid]);
}
}
)";
// OpenCL C source for kernel test_redxor_non_uniform.
// A work-item calls sub_group_non_uniform_reduce_xor(in[gid]) and stores the
// result in out[gid] only when its bit (sub-group local id modulo 32) is set
// in WORK_ITEMS_MASK; the visible body has no store to out[gid] for the
// remaining work-items.
// Type, XY and WORK_ITEMS_MASK are not defined in this string - presumably
// they are supplied when the program is built; confirm against the harness.
static const char *redxor_non_uniform_source = R"(
__kernel void test_redxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_xor(in[gid]);
}
}
)";
// OpenCL C source for kernel test_redand_non_uniform_logical.
// A work-item calls sub_group_non_uniform_reduce_logical_and(in[gid]) and
// stores the result in out[gid] only when its bit (sub-group local id modulo
// 32) is set in WORK_ITEMS_MASK; the visible body has no store to out[gid]
// for the remaining work-items.
// Type, XY and WORK_ITEMS_MASK are not defined in this string - presumably
// they are supplied when the program is built; confirm against the harness.
static const char *redand_non_uniform_logical_source = R"(
__kernel void test_redand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_logical_and(in[gid]);
}
}
)";
// OpenCL C source for kernel test_redor_non_uniform_logical.
// A work-item calls sub_group_non_uniform_reduce_logical_or(in[gid]) and
// stores the result in out[gid] only when its bit (sub-group local id modulo
// 32) is set in WORK_ITEMS_MASK; the visible body has no store to out[gid]
// for the remaining work-items.
// Type, XY and WORK_ITEMS_MASK are not defined in this string - presumably
// they are supplied when the program is built; confirm against the harness.
static const char *redor_non_uniform_logical_source = R"(
__kernel void test_redor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_logical_or(in[gid]);
}
}
)";
// OpenCL C source for kernel test_redxor_non_uniform_logical.
// A work-item calls sub_group_non_uniform_reduce_logical_xor(in[gid]) and
// stores the result in out[gid] only when its bit (sub-group local id modulo
// 32) is set in WORK_ITEMS_MASK; the visible body has no store to out[gid]
// for the remaining work-items.
// Type, XY and WORK_ITEMS_MASK are not defined in this string - presumably
// they are supplied when the program is built; confirm against the harness.
static const char *redxor_non_uniform_logical_source = R"(
__kernel void test_redxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_logical_xor(in[gid]);
}
} }
)"; )";
@@ -354,52 +47,52 @@ template <typename T>
// Runs the non-uniform inclusive-scan (SCIN_NU), exclusive-scan (SCEX_NU) and
// reduce (RED_NU) tests for the add, mul, max and min operations on element
// type T. The results of the twelve rft.run_impl calls are OR-ed together
// and returned.
int run_functions_add_mul_max_min_for_type(RunTestForType rft) int run_functions_add_mul_max_min_for_type(RunTestForType rft)
{ {
int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>( int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
"test_scinadd_non_uniform", scinadd_non_uniform_source); "sub_group_non_uniform_scan_inclusive_add");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::mul_>>( error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::mul_>>(
"test_scinmul_non_uniform", scinmul_non_uniform_source); "sub_group_non_uniform_scan_inclusive_mul");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>( error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
"test_scinmax_non_uniform", scinmax_non_uniform_source); "sub_group_non_uniform_scan_inclusive_max");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>( error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
"test_scinmin_non_uniform", scinmin_non_uniform_source); "sub_group_non_uniform_scan_inclusive_min");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>( error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
"test_scexadd_non_uniform", scexadd_non_uniform_source); "sub_group_non_uniform_scan_exclusive_add");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::mul_>>( error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::mul_>>(
"test_scexmul_non_uniform", scexmul_non_uniform_source); "sub_group_non_uniform_scan_exclusive_mul");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>( error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
"test_scexmax_non_uniform", scexmax_non_uniform_source); "sub_group_non_uniform_scan_exclusive_max");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>( error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
"test_scexmin_non_uniform", scexmin_non_uniform_source); "sub_group_non_uniform_scan_exclusive_min");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>( error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>(
"test_redadd_non_uniform", redadd_non_uniform_source); "sub_group_non_uniform_reduce_add");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::mul_>>( error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::mul_>>(
"test_redmul_non_uniform", redmul_non_uniform_source); "sub_group_non_uniform_reduce_mul");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>( error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>(
"test_redmax_non_uniform", redmax_non_uniform_source); "sub_group_non_uniform_reduce_max");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>( error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>(
"test_redmin_non_uniform", redmin_non_uniform_source); "sub_group_non_uniform_reduce_min");
return error; return error;
} }
// Runs the non-uniform inclusive-scan (SCIN_NU), exclusive-scan (SCEX_NU) and
// reduce (RED_NU) tests for the and, or and xor operations on element type T.
// The results of the nine rft.run_impl calls are OR-ed together and returned.
template <typename T> int run_functions_and_or_xor_for_type(RunTestForType rft) template <typename T> int run_functions_and_or_xor_for_type(RunTestForType rft)
{ {
int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::and_>>( int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::and_>>(
"test_scinand_non_uniform", scinand_non_uniform_source); "sub_group_non_uniform_scan_inclusive_and");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::or_>>( error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::or_>>(
"test_scinor_non_uniform", scinor_non_uniform_source); "sub_group_non_uniform_scan_inclusive_or");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::xor_>>( error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::xor_>>(
"test_scinxor_non_uniform", scinxor_non_uniform_source); "sub_group_non_uniform_scan_inclusive_xor");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::and_>>( error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::and_>>(
"test_scexand_non_uniform", scexand_non_uniform_source); "sub_group_non_uniform_scan_exclusive_and");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::or_>>( error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::or_>>(
"test_scexor_non_uniform", scexor_non_uniform_source); "sub_group_non_uniform_scan_exclusive_or");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::xor_>>( error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::xor_>>(
"test_scexxor_non_uniform", scexxor_non_uniform_source); "sub_group_non_uniform_scan_exclusive_xor");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::and_>>( error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::and_>>(
"test_redand_non_uniform", redand_non_uniform_source); "sub_group_non_uniform_reduce_and");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::or_>>( error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::or_>>(
"test_redor_non_uniform", redor_non_uniform_source); "sub_group_non_uniform_reduce_or");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::xor_>>( error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::xor_>>(
"test_redxor_non_uniform", redxor_non_uniform_source); "sub_group_non_uniform_reduce_xor");
return error; return error;
} }
@@ -407,23 +100,23 @@ template <typename T>
// Runs the non-uniform inclusive-scan (SCIN_NU), exclusive-scan (SCEX_NU) and
// reduce (RED_NU) tests for the logical_and, logical_or and logical_xor
// operations on element type T. The results of the nine rft.run_impl calls
// are OR-ed together and returned.
int run_functions_logical_and_or_xor_for_type(RunTestForType rft) int run_functions_logical_and_or_xor_for_type(RunTestForType rft)
{ {
int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_and>>( int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_and>>(
"test_scinand_non_uniform_logical", scinand_non_uniform_logical_source); "sub_group_non_uniform_scan_inclusive_logical_and");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_or>>( error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_or>>(
"test_scinor_non_uniform_logical", scinor_non_uniform_logical_source); "sub_group_non_uniform_scan_inclusive_logical_or");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_xor>>( error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_xor>>(
"test_scinxor_non_uniform_logical", scinxor_non_uniform_logical_source); "sub_group_non_uniform_scan_inclusive_logical_xor");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_and>>( error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_and>>(
"test_scexand_non_uniform_logical", scexand_non_uniform_logical_source); "sub_group_non_uniform_scan_exclusive_logical_and");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_or>>( error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_or>>(
"test_scexor_non_uniform_logical", scexor_non_uniform_logical_source); "sub_group_non_uniform_scan_exclusive_logical_or");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_xor>>( error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_xor>>(
"test_scexxor_non_uniform_logical", scexxor_non_uniform_logical_source); "sub_group_non_uniform_scan_exclusive_logical_xor");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_and>>( error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_and>>(
"test_redand_non_uniform_logical", redand_non_uniform_logical_source); "sub_group_non_uniform_reduce_logical_and");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_or>>( error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_or>>(
"test_redor_non_uniform_logical", redor_non_uniform_logical_source); "sub_group_non_uniform_reduce_logical_or");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_xor>>( error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_xor>>(
"test_redxor_non_uniform_logical", redxor_non_uniform_logical_source); "sub_group_non_uniform_reduce_logical_xor");
return error; return error;
} }
@@ -441,13 +134,11 @@ int test_subgroup_functions_non_uniform_arithmetic(cl_device_id device,
"this device, skipping test.\n"); "this device, skipping test.\n");
return TEST_SKIPPED_ITSELF; return TEST_SKIPPED_ITSELF;
} }
std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
0x00ffff00, 0x80000000, 0xaaaaaaaa };
constexpr size_t global_work_size = 2000; constexpr size_t global_work_size = 2000;
constexpr size_t local_work_size = 200; constexpr size_t local_work_size = 200;
WorkGroupParams test_params(global_work_size, local_work_size, masks); WorkGroupParams test_params(global_work_size, local_work_size, true);
test_params.save_kernel_source(sub_group_non_uniform_arithmetic_source);
RunTestForType rft(device, context, queue, num_elements, test_params); RunTestForType rft(device, context, queue, num_elements, test_params);
int error = run_functions_add_mul_max_min_for_type<cl_int>(rft); int error = run_functions_add_mul_max_min_for_type<cl_int>(rft);

View File

@@ -28,7 +28,6 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
int nw = test_params.local_workgroup_size; int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size; int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size; int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
int nj = (nw + ns - 1) / ns; int nj = (nw + ns - 1) / ns;
int non_uniform_size = ng % nw; int non_uniform_size = ng % nw;
ng = ng / nw; ng = ng / nw;
@@ -40,9 +39,11 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
operation_names(operation)); operation_names(operation));
log_info(" test params: global size = %d local size = %d subgroups " log_info(" test params: global size = %d local size = %d subgroups "
"size = %d work item mask = 0x%x data type (%s)\n", "size = %d data type (%s)\n",
test_params.global_workgroup_size, nw, ns, work_items_mask, test_params.global_workgroup_size, nw, ns,
TypeManager<T>::name()); TypeManager<T>::name());
log_info(" work items mask: %s\n",
test_params.work_items_mask.to_string().c_str());
if (non_uniform_size) if (non_uniform_size)
{ {
log_info(" non uniform work group size mode ON\n"); log_info(" non uniform work group size mode ON\n");
@@ -99,7 +100,6 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
int nw = test_params.local_workgroup_size; int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size; int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size; int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
int nj = (nw + ns - 1) / ns; int nj = (nw + ns - 1) / ns;
cl_int tr, rr; cl_int tr, rr;
int non_uniform_size = ng % nw; int non_uniform_size = ng % nw;
@@ -141,8 +141,7 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
std::set<int> active_work_items; std::set<int> active_work_items;
for (i = 0; i < n; ++i) for (i = 0; i < n; ++i)
{ {
uint32_t check_work_item = 1 << (i % 32); if (test_params.work_items_mask.test(i))
if (work_items_mask & check_work_item)
{ {
active_work_items.insert(i); active_work_items.insert(i);
switch (operation) switch (operation)
@@ -215,46 +214,47 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
return TEST_PASS; return TEST_PASS;
} }
}; };
// Kernel source for the sub_group_elect test.
// In the std::string sub_group_elect_source text, each work-item picks one
// 32-bit word of work_item_mask_vector by its sub-group local id (below 32:
// .x, below 64: .y, below 96: .w, below 128: .z) and calls sub_group_elect()
// only if its bit (local id modulo 32) is set in that word.
// NOTE(review): .w is selected before .z, and work_item_mask is left
// uninitialized when the local id is 128 or more. Confirm both against the
// code that defines work_item_mask_vector, which is not visible here.
static const char *elect_source = R"(
__kernel void test_elect(const __global Type *in, __global int4 *xy, __global Type *out) { std::string sub_group_elect_source = R"(
__kernel void test_sub_group_elect(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0); int gid = get_global_id(0);
XY(xy,gid); XY(xy,gid);
uint elect_work_item = 1 << (get_sub_group_local_id() % 32); uint subgroup_local_id = get_sub_group_local_id();
if (elect_work_item & WORK_ITEMS_MASK){ uint elect_work_item = 1 << (subgroup_local_id % 32);
out[gid] = sub_group_elect(); uint work_item_mask;
} if(subgroup_local_id < 32) {
work_item_mask = work_item_mask_vector.x;
} else if(subgroup_local_id < 64) {
work_item_mask = work_item_mask_vector.y;
} else if(subgroup_local_id < 96) {
work_item_mask = work_item_mask_vector.w;
} else if(subgroup_local_id < 128) {
work_item_mask = work_item_mask_vector.z;
}
if (elect_work_item & work_item_mask){
out[gid] = sub_group_elect();
}
} }
)"; )";
// Kernel sources for the non-uniform any / all / all_equal vote tests.
// In the std::string sub_group_non_uniform_any_all_all_equal_source text the
// kernel name and the called function are both %s placeholders. Each
// work-item picks one 32-bit word of work_item_mask_vector by its sub-group
// local id (below 32: .x, below 64: .y, below 96: .w, below 128: .z) and
// makes the call only if its bit (local id modulo 32) is set in that word.
// NOTE(review): .w is selected before .z, and work_item_mask is left
// uninitialized when the local id is 128 or more. Confirm both against the
// code that defines work_item_mask_vector, which is not visible here.
static const char *non_uniform_any_source = R"( std::string sub_group_non_uniform_any_all_all_equal_source = R"(
__kernel void test_non_uniform_any(const __global Type *in, __global int4 *xy, __global Type *out) { __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0); int gid = get_global_id(0);
XY(xy,gid); XY(xy,gid);
uint elect_work_item = 1 << (get_sub_group_local_id() % 32); uint subgroup_local_id = get_sub_group_local_id();
if (elect_work_item & WORK_ITEMS_MASK){ uint elect_work_item = 1 << (subgroup_local_id % 32);
out[gid] = sub_group_non_uniform_any(in[gid]); uint work_item_mask;
} if(subgroup_local_id < 32) {
} work_item_mask = work_item_mask_vector.x;
)"; } else if(subgroup_local_id < 64) {
work_item_mask = work_item_mask_vector.y;
static const char *non_uniform_all_source = R"( } else if(subgroup_local_id < 96) {
__kernel void test_non_uniform_all(const __global Type *in, __global int4 *xy, __global Type *out) { work_item_mask = work_item_mask_vector.w;
int gid = get_global_id(0); } else if(subgroup_local_id < 128) {
XY(xy,gid); work_item_mask = work_item_mask_vector.z;
uint elect_work_item = 1 << (get_sub_group_local_id() % 32); }
if (elect_work_item & WORK_ITEMS_MASK){ if (elect_work_item & work_item_mask){
out[gid] = sub_group_non_uniform_all(in[gid]); out[gid] = %s(in[gid]);
}
}
)";
static const char *non_uniform_all_equal_source = R"(
__kernel void test_non_uniform_all_equal(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_all_equal(in[gid]);
} }
} }
)"; )";
@@ -262,7 +262,7 @@ static const char *non_uniform_all_equal_source = R"(
// Runs the VOTE test with NonUniformVoteOp::all_equal for element type T and
// returns the result of that single rft.run_impl call.
template <typename T> int run_vote_all_equal_for_type(RunTestForType rft) template <typename T> int run_vote_all_equal_for_type(RunTestForType rft)
{ {
int error = rft.run_impl<T, VOTE<T, NonUniformVoteOp::all_equal>>( int error = rft.run_impl<T, VOTE<T, NonUniformVoteOp::all_equal>>(
"test_non_uniform_all_equal", non_uniform_all_equal_source); "sub_group_non_uniform_all_equal");
return error; return error;
} }
} }
@@ -278,12 +278,13 @@ int test_subgroup_functions_non_uniform_vote(cl_device_id device,
"device, skipping test.\n"); "device, skipping test.\n");
return TEST_SKIPPED_ITSELF; return TEST_SKIPPED_ITSELF;
} }
std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
0x00ffff00, 0x80000000 };
constexpr size_t global_work_size = 170; constexpr size_t global_work_size = 170;
constexpr size_t local_work_size = 64; constexpr size_t local_work_size = 64;
WorkGroupParams test_params(global_work_size, local_work_size, masks); WorkGroupParams test_params(global_work_size, local_work_size, true);
test_params.save_kernel_source(
sub_group_non_uniform_any_all_all_equal_source);
test_params.save_kernel_source(sub_group_elect_source, "sub_group_elect");
RunTestForType rft(device, context, queue, num_elements, test_params); RunTestForType rft(device, context, queue, num_elements, test_params);
int error = run_vote_all_equal_for_type<cl_int>(rft); int error = run_vote_all_equal_for_type<cl_int>(rft);
@@ -295,10 +296,10 @@ int test_subgroup_functions_non_uniform_vote(cl_device_id device,
error |= run_vote_all_equal_for_type<subgroups::cl_half>(rft); error |= run_vote_all_equal_for_type<subgroups::cl_half>(rft);
error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::all>>( error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::all>>(
"test_non_uniform_all", non_uniform_all_source); "sub_group_non_uniform_all");
error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::elect>>( error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::elect>>(
"test_elect", elect_source); "sub_group_elect");
error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::any>>( error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::any>>(
"test_non_uniform_any", non_uniform_any_source); "sub_group_non_uniform_any");
return error; return error;
} }

View File

@@ -15,38 +15,19 @@
// //
#include "procs.h" #include "procs.h"
#include "subhelpers.h" #include "subhelpers.h"
#include "subgroup_common_kernels.h"
#include "subgroup_common_templates.h" #include "subgroup_common_templates.h"
#include "harness/typeWrappers.h" #include "harness/typeWrappers.h"
#include <bitset> #include <bitset>
namespace { namespace {
// OpenCL C source for kernel test_sub_group_shuffle_xor: after XY(xy,gid),
// each work-item stores sub_group_shuffle_xor(in[gid], xy[gid].z) in
// out[gid]. Held in a single raw string literal.
// Note: no line break precedes the kernel's closing brace.
static const char* shuffle_xor_source =
    R"(__kernel void test_sub_group_shuffle_xor(const __global Type *in, __global int4 *xy, __global Type *out)
{
 int gid = get_global_id(0);
 XY(xy,gid);
 Type x = in[gid];
 out[gid] = sub_group_shuffle_xor(x, xy[gid].z);}
)";
// OpenCL C source for kernel test_sub_group_shuffle: after XY(xy,gid), each
// work-item stores sub_group_shuffle(in[gid], xy[gid].z) in out[gid].
// Held in a single raw string literal.
// Note: no line break precedes the kernel's closing brace.
static const char* shuffle_source =
    R"(__kernel void test_sub_group_shuffle(const __global Type *in, __global int4 *xy, __global Type *out)
{
 int gid = get_global_id(0);
 XY(xy,gid);
 Type x = in[gid];
 out[gid] = sub_group_shuffle(x, xy[gid].z);}
)";
// Runs the SHF tests for ShuffleOp::shuffle and ShuffleOp::shuffle_xor on
// element type T; the two rft.run_impl results are OR-ed together and
// returned.
template <typename T> int run_shuffle_for_type(RunTestForType rft) template <typename T> int run_shuffle_for_type(RunTestForType rft)
{ {
int error = rft.run_impl<T, SHF<T, ShuffleOp::shuffle>>( int error =
"test_sub_group_shuffle", shuffle_source); rft.run_impl<T, SHF<T, ShuffleOp::shuffle>>("sub_group_shuffle");
error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_xor>>( error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_xor>>(
"test_sub_group_shuffle_xor", shuffle_xor_source); "sub_group_shuffle_xor");
return error; return error;
} }
@@ -61,9 +42,11 @@ int test_subgroup_functions_shuffle(cl_device_id device, cl_context context,
"skipping test.\n"); "skipping test.\n");
return TEST_SKIPPED_ITSELF; return TEST_SKIPPED_ITSELF;
} }
constexpr size_t global_work_size = 2000; constexpr size_t global_work_size = 2000;
constexpr size_t local_work_size = 200; constexpr size_t local_work_size = 200;
WorkGroupParams test_params(global_work_size, local_work_size); WorkGroupParams test_params(global_work_size, local_work_size);
test_params.save_kernel_source(sub_group_generic_source);
RunTestForType rft(device, context, queue, num_elements, test_params); RunTestForType rft(device, context, queue, num_elements, test_params);
int error = run_shuffle_for_type<cl_int>(rft); int error = run_shuffle_for_type<cl_int>(rft);

View File

@@ -15,37 +15,19 @@
// //
#include "procs.h" #include "procs.h"
#include "subhelpers.h" #include "subhelpers.h"
#include "subgroup_common_kernels.h"
#include "subgroup_common_templates.h" #include "subgroup_common_templates.h"
#include "harness/conversions.h" #include "harness/conversions.h"
#include "harness/typeWrappers.h" #include "harness/typeWrappers.h"
namespace { namespace {
// OpenCL C source for kernel test_sub_group_shuffle_down: after XY(xy,gid),
// each work-item stores sub_group_shuffle_down(in[gid], xy[gid].z) in
// out[gid]. Held in a single raw string literal.
// Note: no line break precedes the kernel's closing brace.
static const char* shuffle_down_source =
    R"(__kernel void test_sub_group_shuffle_down(const __global Type *in, __global int4 *xy, __global Type *out)
{
 int gid = get_global_id(0);
 XY(xy,gid);
 Type x = in[gid];
 out[gid] = sub_group_shuffle_down(x, xy[gid].z);}
)";
// OpenCL C source for kernel test_sub_group_shuffle_up: after XY(xy,gid),
// each work-item stores sub_group_shuffle_up(in[gid], xy[gid].z) in
// out[gid]. Held in a single raw string literal.
// Note: no line break precedes the kernel's closing brace.
static const char* shuffle_up_source =
    R"(__kernel void test_sub_group_shuffle_up(const __global Type *in, __global int4 *xy, __global Type *out)
{
 int gid = get_global_id(0);
 XY(xy,gid);
 Type x = in[gid];
 out[gid] = sub_group_shuffle_up(x, xy[gid].z);}
)";
// Runs the SHF tests for ShuffleOp::shuffle_up and ShuffleOp::shuffle_down on
// element type T; the two rft.run_impl results are OR-ed together and
// returned.
template <typename T> int run_shuffle_relative_for_type(RunTestForType rft) template <typename T> int run_shuffle_relative_for_type(RunTestForType rft)
{ {
int error = rft.run_impl<T, SHF<T, ShuffleOp::shuffle_up>>( int error =
"test_sub_group_shuffle_up", shuffle_up_source); rft.run_impl<T, SHF<T, ShuffleOp::shuffle_up>>("sub_group_shuffle_up");
error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_down>>( error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_down>>(
"test_sub_group_shuffle_down", shuffle_down_source); "sub_group_shuffle_down");
return error; return error;
} }
@@ -62,9 +44,11 @@ int test_subgroup_functions_shuffle_relative(cl_device_id device,
"device, skipping test.\n"); "device, skipping test.\n");
return TEST_SKIPPED_ITSELF; return TEST_SKIPPED_ITSELF;
} }
constexpr size_t global_work_size = 2000; constexpr size_t global_work_size = 2000;
constexpr size_t local_work_size = 200; constexpr size_t local_work_size = 200;
WorkGroupParams test_params(global_work_size, local_work_size); WorkGroupParams test_params(global_work_size, local_work_size);
test_params.save_kernel_source(sub_group_generic_source);
RunTestForType rft(device, context, queue, num_elements, test_params); RunTestForType rft(device, context, queue, num_elements, test_params);
int error = run_shuffle_relative_for_type<cl_int>(rft); int error = run_shuffle_relative_for_type<cl_int>(rft);