Extended subgroups - use 128bit masks (#1215)

* Extended subgroups - use 128bit masks

* Refactoring to avoid kernels code duplication

* unify kernel names as test_ prefix + subgroups function name
* use string literals that improve readability
* use kernel templates that limit code duplication
* WorkGroupParams allows defining a default kernel - a kernel template shared by multiple functions
* WorkGroupParams allows defining a kernel for one specific subgroup function

Co-authored-by: Stuart Brady <stuart.brady@arm.com>
This commit is contained in:
Grzegorz Wawiorko
2021-10-01 12:28:37 +02:00
committed by GitHub
parent 903f1bf65d
commit 92844bead1
12 changed files with 592 additions and 1054 deletions

View File

@@ -15,92 +15,20 @@
//
#include "subgroup_common_kernels.h"
const char* bcast_source =
"__kernel void test_bcast(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" uint which_sub_group_local_id = xy[gid].z;\n"
" out[gid] = sub_group_broadcast(x, which_sub_group_local_id);\n"
"}\n";
std::string sub_group_reduction_scan_source = R"(
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
out[gid] = %s(in[gid]);
}
)";
const char* redadd_source = "__kernel void test_redadd(const __global Type "
"*in, __global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_reduce_add(in[gid]);\n"
"}\n";
const char* redmax_source = "__kernel void test_redmax(const __global Type "
"*in, __global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_reduce_max(in[gid]);\n"
"}\n";
const char* redmin_source = "__kernel void test_redmin(const __global Type "
"*in, __global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_reduce_min(in[gid]);\n"
"}\n";
const char* scinadd_source =
"__kernel void test_scinadd(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_inclusive_add(in[gid]);\n"
"}\n";
const char* scinmax_source =
"__kernel void test_scinmax(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_inclusive_max(in[gid]);\n"
"}\n";
const char* scinmin_source =
"__kernel void test_scinmin(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_inclusive_min(in[gid]);\n"
"}\n";
const char* scexadd_source =
"__kernel void test_scexadd(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_exclusive_add(in[gid]);\n"
"}\n";
const char* scexmax_source =
"__kernel void test_scexmax(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_exclusive_max(in[gid]);\n"
"}\n";
const char* scexmin_source =
"__kernel void test_scexmin(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_exclusive_min(in[gid]);\n"
"}\n";
std::string sub_group_generic_source = R"(
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
Type x = in[gid];
out[gid] = %s(x, xy[gid].z);
}
)";

View File

@@ -18,15 +18,7 @@
#include "subhelpers.h"
extern const char* bcast_source;
extern const char* redadd_source;
extern const char* redmax_source;
extern const char* redmin_source;
extern const char* scinadd_source;
extern const char* scinmax_source;
extern const char* scinmin_source;
extern const char* scexadd_source;
extern const char* scexmax_source;
extern const char* scexmin_source;
extern std::string sub_group_reduction_scan_source;
extern std::string sub_group_generic_source;
#endif

View File

@@ -17,13 +17,10 @@
#define SUBGROUPCOMMONTEMPLATES_H
#include "typeWrappers.h"
#include <bitset>
#include "CL/cl_half.h"
#include "subhelpers.h"
#include <set>
typedef std::bitset<128> bs128;
static cl_uint4 generate_bit_mask(cl_uint subgroup_local_id,
const std::string &mask_type,
cl_uint max_sub_group_size)
@@ -577,16 +574,21 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
ng = ng / nw;
std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive"
test_params.work_items_mask.any()
? func_name = "sub_group_non_uniform_scan_exclusive"
: func_name = "sub_group_scan_exclusive";
log_info(" %s_%s(%s)...\n", func_name.c_str(),
operation_names(operation), TypeManager<Ty>::name());
log_info(" test params: global size = %d local size = %d subgroups "
"size = %d work item mask = 0x%x \n",
test_params.global_workgroup_size, nw, ns, work_items_mask);
"size = %d \n",
test_params.global_workgroup_size, nw, ns);
if (test_params.work_items_mask.any())
{
log_info(" work items mask: %s\n",
test_params.work_items_mask.to_string().c_str());
}
genrand<Ty, operation>(x, t, m, ns, nw, ng);
}
@@ -597,18 +599,22 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
bs128 work_items_mask = test_params.work_items_mask;
int nj = (nw + ns - 1) / ns;
Ty tr, rr;
ng = ng / nw;
std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive"
test_params.work_items_mask.any()
? func_name = "sub_group_non_uniform_scan_exclusive"
: func_name = "sub_group_scan_exclusive";
uint32_t use_work_items_mask;
// for uniform case take into consideration all workitems
use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask;
if (!work_items_mask.any())
{
work_items_mask.set();
}
for (k = 0; k < ng; ++k)
{ // for each work_group
// Map to array indexed to array indexed by local ID and sub group
@@ -624,8 +630,7 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
std::set<int> active_work_items;
for (i = 0; i < n; ++i)
{
uint32_t check_work_item = 1 << (i % 32);
if (use_work_items_mask & check_work_item)
if (work_items_mask.test(i))
{
active_work_items.insert(i);
}
@@ -688,18 +693,23 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
ng = ng / nw;
std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive"
test_params.work_items_mask.any()
? func_name = "sub_group_non_uniform_scan_inclusive"
: func_name = "sub_group_scan_inclusive";
genrand<Ty, operation>(x, t, m, ns, nw, ng);
log_info(" %s_%s(%s)...\n", func_name.c_str(),
operation_names(operation), TypeManager<Ty>::name());
log_info(" test params: global size = %d local size = %d subgroups "
"size = %d work item mask = 0x%x \n",
test_params.global_workgroup_size, nw, ns, work_items_mask);
"size = %d \n",
test_params.global_workgroup_size, nw, ns);
if (test_params.work_items_mask.any())
{
log_info(" work items mask: %s\n",
test_params.work_items_mask.to_string().c_str());
}
}
static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
@@ -709,18 +719,22 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
bs128 work_items_mask = test_params.work_items_mask;
int nj = (nw + ns - 1) / ns;
Ty tr, rr;
ng = ng / nw;
std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive"
work_items_mask.any()
? func_name = "sub_group_non_uniform_scan_inclusive"
: func_name = "sub_group_scan_inclusive";
uint32_t use_work_items_mask;
// for uniform case take into consideration all workitems
use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask;
if (!work_items_mask.any())
{
work_items_mask.set();
}
// std::bitset<32> mask32(use_work_items_mask);
// for (int k) mask32.count();
for (k = 0; k < ng; ++k)
@@ -740,8 +754,7 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU
for (i = 0; i < n; ++i)
{
uint32_t check_work_item = 1 << (i % 32);
if (use_work_items_mask & check_work_item)
if (work_items_mask.test(i))
{
if (catch_frist_active == -1)
{
@@ -807,17 +820,22 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
ng = ng / nw;
std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_reduce"
test_params.work_items_mask.any()
? func_name = "sub_group_non_uniform_reduce"
: func_name = "sub_group_reduce";
log_info(" %s_%s(%s)...\n", func_name.c_str(),
operation_names(operation), TypeManager<Ty>::name());
log_info(" test params: global size = %d local size = %d subgroups "
"size = %d work item mask = 0x%x \n",
test_params.global_workgroup_size, nw, ns, work_items_mask);
"size = %d \n",
test_params.global_workgroup_size, nw, ns);
if (test_params.work_items_mask.any())
{
log_info(" work items mask: %s\n",
test_params.work_items_mask.to_string().c_str());
}
genrand<Ty, operation>(x, t, m, ns, nw, ng);
}
@@ -828,13 +846,13 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
bs128 work_items_mask = test_params.work_items_mask;
int nj = (nw + ns - 1) / ns;
ng = ng / nw;
Ty tr, rr;
std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_reduce"
work_items_mask.any() ? func_name = "sub_group_non_uniform_reduce"
: func_name = "sub_group_reduce";
for (k = 0; k < ng; ++k)
@@ -847,9 +865,10 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
my[j] = y[j];
}
uint32_t use_work_items_mask;
use_work_items_mask =
!work_items_mask ? 0xFFFFFFFF : work_items_mask;
if (!work_items_mask.any())
{
work_items_mask.set();
}
for (j = 0; j < nj; ++j)
{
@@ -859,8 +878,7 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
int catch_frist_active = -1;
for (i = 0; i < n; ++i)
{
uint32_t check_work_item = 1 << (i % 32);
if (use_work_items_mask & check_work_item)
if (work_items_mask.test(i))
{
if (catch_frist_active == -1)
{

View File

@@ -24,31 +24,172 @@
#include <limits>
#include <vector>
#include <type_traits>
#include <bitset>
#include <regex>
#include <map>
#define NR_OF_ACTIVE_WORK_ITEMS 4
extern MTdata gMTdata;
typedef std::bitset<128> bs128;
extern cl_half_rounding_mode g_rounding_mode;
struct WorkGroupParams
{
WorkGroupParams(size_t gws, size_t lws,
const std::vector<uint32_t> &all_wim = {})
bool use_mask = false)
: global_workgroup_size(gws), local_workgroup_size(lws),
all_work_item_masks(all_wim)
use_masks(use_mask)
{
subgroup_size = 0;
work_items_mask = 0;
use_core_subgroups = true;
dynsc = 0;
load_masks();
}
size_t global_workgroup_size;
size_t local_workgroup_size;
size_t subgroup_size;
uint32_t work_items_mask;
bs128 work_items_mask;
int dynsc;
bool use_core_subgroups;
std::vector<uint32_t> all_work_item_masks;
std::vector<bs128> all_work_item_masks;
bool use_masks;
void save_kernel_source(const std::string &source, std::string name = "")
{
if (name == "")
{
name = "default";
}
if (kernel_function_name.find(name) != kernel_function_name.end())
{
log_info("Kernel definition duplication. Source will be "
"overwritten for function name %s",
name.c_str());
}
kernel_function_name[name] = source;
};
// return specific defined kernel or default.
std::string get_kernel_source(std::string name)
{
if (kernel_function_name.find(name) == kernel_function_name.end())
{
return kernel_function_name["default"];
}
return kernel_function_name[name];
}
private:
std::map<std::string, std::string> kernel_function_name;
void load_masks()
{
if (use_masks)
{
// '1' in the string sets the bit to 1, '0' sets it to 0
bs128 mask_0xf0f0f0f0("11110000111100001111000011110000"
"11110000111100001111000011110000"
"11110000111100001111000011110000"
"11110000111100001111000011110000",
128, '0', '1');
all_work_item_masks.push_back(mask_0xf0f0f0f0);
// '1' in the string sets the bit to 0, '0' sets it to 1 (inverted)
bs128 mask_0x0f0f0f0f("11110000111100001111000011110000"
"11110000111100001111000011110000"
"11110000111100001111000011110000"
"11110000111100001111000011110000",
128, '1', '0');
all_work_item_masks.push_back(mask_0x0f0f0f0f);
bs128 mask_0x5555aaaa("10101010101010101010101010101010"
"10101010101010101010101010101010"
"10101010101010101010101010101010"
"10101010101010101010101010101010",
128, '0', '1');
all_work_item_masks.push_back(mask_0x5555aaaa);
bs128 mask_0xaaaa5555("10101010101010101010101010101010"
"10101010101010101010101010101010"
"10101010101010101010101010101010"
"10101010101010101010101010101010",
128, '1', '0');
all_work_item_masks.push_back(mask_0xaaaa5555);
// 0x0f0ff0f0
bs128 mask_0x0f0ff0f0("00001111000011111111000011110000"
"00001111000011111111000011110000"
"00001111000011111111000011110000"
"00001111000011111111000011110000",
128, '0', '1');
all_work_item_masks.push_back(mask_0x0f0ff0f0);
// 0xff0000ff
bs128 mask_0xff0000ff("11111111000000000000000011111111"
"11111111000000000000000011111111"
"11111111000000000000000011111111"
"11111111000000000000000011111111",
128, '0', '1');
all_work_item_masks.push_back(mask_0xff0000ff);
// 0xff00ff00
bs128 mask_0xff00ff00("11111111000000001111111100000000"
"11111111000000001111111100000000"
"11111111000000001111111100000000"
"11111111000000001111111100000000",
128, '0', '1');
all_work_item_masks.push_back(mask_0xff00ff00);
// 0x00ffff00
bs128 mask_0x00ffff00("00000000111111111111111100000000"
"00000000111111111111111100000000"
"00000000111111111111111100000000"
"00000000111111111111111100000000",
128, '0', '1');
all_work_item_masks.push_back(mask_0x00ffff00);
// 0x80 1 workitem highest id for 8 subgroup size
bs128 mask_0x80808080("10000000100000001000000010000000"
"10000000100000001000000010000000"
"10000000100000001000000010000000"
"10000000100000001000000010000000",
128, '0', '1');
all_work_item_masks.push_back(mask_0x80808080);
// 0x8000 1 workitem highest id for 16 subgroup size
bs128 mask_0x80008000("10000000000000001000000000000000"
"10000000000000001000000000000000"
"10000000000000001000000000000000"
"10000000000000001000000000000000",
128, '0', '1');
all_work_item_masks.push_back(mask_0x80008000);
// 0x80000000 1 workitem highest id for 32 subgroup size
bs128 mask_0x80000000("10000000000000000000000000000000"
"10000000000000000000000000000000"
"10000000000000000000000000000000"
"10000000000000000000000000000000",
128, '0', '1');
all_work_item_masks.push_back(mask_0x80000000);
// 0x80000000 00000000 1 workitem highest id for 64 subgroup size
bs128 mask_0x8000000000000000("10000000000000000000000000000000"
"00000000000000000000000000000000"
"10000000000000000000000000000000"
"00000000000000000000000000000000",
128, '0', '1');
all_work_item_masks.push_back(mask_0x8000000000000000);
// 0x80000000 00000000 00000000 00000000 1 workitem highest id for
// 128 subgroup size
bs128 mask_0x80000000000000000000000000000000(
"10000000000000000000000000000000"
"00000000000000000000000000000000"
"00000000000000000000000000000000"
"00000000000000000000000000000000",
128, '0', '1');
all_work_item_masks.push_back(
mask_0x80000000000000000000000000000000);
bs128 mask_0xffffffff("11111111111111111111111111111111"
"11111111111111111111111111111111"
"11111111111111111111111111111111"
"11111111111111111111111111111111",
128, '0', '1');
all_work_item_masks.push_back(mask_0xffffffff);
}
}
};
enum class SubgroupsBroadcastOp
@@ -1267,11 +1408,23 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
std::vector<Ty> mapout;
mapout.resize(local);
std::stringstream kernel_sstr;
if (test_params.work_items_mask != 0)
if (test_params.use_masks)
{
kernel_sstr << "#define WORK_ITEMS_MASK ";
kernel_sstr << "0x" << std::hex << test_params.work_items_mask
<< "\n";
// Prepare uint4 type to store bitmask on kernel OpenCL C side
// To keep order the first character in string is the lowest bit
// there was a need to give such offset to bitset constructor
// (first highest offset = 96)
std::bitset<32> bits_1_32(test_params.work_items_mask.to_string(),
96, 32);
std::bitset<32> bits_33_64(test_params.work_items_mask.to_string(),
64, 32);
std::bitset<32> bits_65_96(test_params.work_items_mask.to_string(),
32, 32);
std::bitset<32> bits_97_128(test_params.work_items_mask.to_string(),
0, 32);
kernel_sstr << "global uint4 work_item_mask_vector = (uint4)(0b"
<< bits_1_32 << ",0b" << bits_33_64 << ",0b"
<< bits_65_96 << ",0b" << bits_97_128 << ");\n";
}
@@ -1452,18 +1605,24 @@ struct RunTestForType
num_elements_(num_elements), test_params_(test_params)
{}
template <typename T, typename U>
int run_impl(const char *kernel_name, const char *source)
int run_impl(const std::string &function_name)
{
int error = TEST_PASS;
std::string source =
std::regex_replace(test_params_.get_kernel_source(function_name),
std::regex("\\%s"), function_name);
std::string kernel_name = "test_" + function_name;
if (test_params_.all_work_item_masks.size() > 0)
{
error = test<T, U>::mrun(device_, context_, queue_, num_elements_,
kernel_name, source, test_params_);
kernel_name.c_str(), source.c_str(),
test_params_);
}
else
{
error = test<T, U>::run(device_, context_, queue_, num_elements_,
kernel_name, source, test_params_);
kernel_name.c_str(), source.c_str(),
test_params_);
}
return error;

View File

@@ -150,25 +150,25 @@ template <typename T>
int run_broadcast_scan_reduction_for_type(RunTestForType rft)
{
int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
"test_bcast", bcast_source);
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("test_redadd",
redadd_source);
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("test_redmax",
redmax_source);
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("test_redmin",
redmin_source);
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>("test_scinadd",
scinadd_source);
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>("test_scinmax",
scinmax_source);
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>("test_scinmin",
scinmin_source);
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>("test_scexadd",
scexadd_source);
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>("test_scexmax",
scexmax_source);
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>("test_scexmin",
scexmin_source);
"sub_group_broadcast");
error |=
rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("sub_group_reduce_add");
error |=
rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("sub_group_reduce_max");
error |=
rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("sub_group_reduce_min");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
"sub_group_scan_inclusive_add");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
"sub_group_scan_inclusive_max");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
"sub_group_scan_inclusive_min");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
"sub_group_scan_exclusive_add");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
"sub_group_scan_exclusive_max");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
"sub_group_scan_exclusive_min");
return error;
}
@@ -181,11 +181,14 @@ int test_subgroup_functions(cl_device_id device, cl_context context,
constexpr size_t global_work_size = 2000;
constexpr size_t local_work_size = 200;
WorkGroupParams test_params(global_work_size, local_work_size);
test_params.save_kernel_source(sub_group_reduction_scan_source);
test_params.save_kernel_source(sub_group_generic_source,
"sub_group_broadcast");
RunTestForType rft(device, context, queue, num_elements, test_params);
int error =
rft.run_impl<cl_int, AA<NonUniformVoteOp::any>>("test_any", any_source);
error |=
rft.run_impl<cl_int, AA<NonUniformVoteOp::all>>("test_all", all_source);
rft.run_impl<cl_int, AA<NonUniformVoteOp::any>>("sub_group_any");
error |= rft.run_impl<cl_int, AA<NonUniformVoteOp::all>>("sub_group_all");
error |= run_broadcast_scan_reduction_for_type<cl_int>(rft);
error |= run_broadcast_scan_reduction_for_type<cl_uint>(rft);
error |= run_broadcast_scan_reduction_for_type<cl_long>(rft);

View File

@@ -684,239 +684,127 @@ template <typename Ty, BallotOp operation> struct SMASK
}
};
static const char *bcast_non_uniform_source =
"__kernel void test_bcast_non_uniform(const __global Type *in, __global "
"int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n"
" out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].z);\n"
" } else {\n"
" out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].w);\n"
" }\n"
"}\n";
std::string sub_group_non_uniform_broadcast_source = R"(
__kernel void test_sub_group_non_uniform_broadcast(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
Type x = in[gid];
if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {
out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].z);
} else {
out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].w);
}
}
)";
std::string sub_group_broadcast_first_source = R"(
__kernel void test_sub_group_broadcast_first(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
Type x = in[gid];
if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {
out[gid] = sub_group_broadcast_first(x);;
} else {
out[gid] = sub_group_broadcast_first(x);;
}
}
)";
std::string sub_group_ballot_bit_scan_find_source = R"(
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
Type x = in[gid];
uint4 value = (uint4)(0,0,0,0);
value = (uint4)(%s(x),0,0,0);
out[gid] = value;
}
)";
std::string sub_group_ballot_mask_source = R"(
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
xy[gid].z = get_max_sub_group_size();
Type x = in[gid];
uint4 mask = %s();
out[gid] = mask;
}
)";
std::string sub_group_ballot_source = R"(
__kernel void test_sub_group_ballot(const __global Type *in, __global int4 *xy, __global Type *out) {
uint4 full_ballot = sub_group_ballot(1);
uint divergence_mask;
uint4 partial_ballot;
uint gid = get_global_id(0);
XY(xy,gid);
if (get_sub_group_local_id() & 1) {
divergence_mask = 0xaaaaaaaa;
partial_ballot = sub_group_ballot(1);
} else {
divergence_mask = 0x55555555;
partial_ballot = sub_group_ballot(1);
}
size_t lws = get_local_size(0);
uint4 masked_ballot = full_ballot;
masked_ballot.x &= divergence_mask;
masked_ballot.y &= divergence_mask;
masked_ballot.z &= divergence_mask;
masked_ballot.w &= divergence_mask;
out[gid] = all(masked_ballot == partial_ballot);
static const char *bcast_first_source =
"__kernel void test_bcast_first(const __global Type *in, __global int4 "
"*xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n"
" out[gid] = sub_group_broadcast_first(x);\n"
" } else {\n"
" out[gid] = sub_group_broadcast_first(x);\n"
" }\n"
"}\n";
static const char *ballot_bit_count_source =
"__kernel void test_sub_group_ballot_bit_count(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" uint4 value = (uint4)(0,0,0,0);\n"
" value = (uint4)(sub_group_ballot_bit_count(x),0,0,0);\n"
" out[gid] = value;\n"
"}\n";
static const char *ballot_inclusive_scan_source =
"__kernel void test_sub_group_ballot_inclusive_scan(const __global Type "
"*in, __global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" uint4 value = (uint4)(0,0,0,0);\n"
" value = (uint4)(sub_group_ballot_inclusive_scan(x),0,0,0);\n"
" out[gid] = value;\n"
"}\n";
static const char *ballot_exclusive_scan_source =
"__kernel void test_sub_group_ballot_exclusive_scan(const __global Type "
"*in, __global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" uint4 value = (uint4)(0,0,0,0);\n"
" value = (uint4)(sub_group_ballot_exclusive_scan(x),0,0,0);\n"
" out[gid] = value;\n"
"}\n";
static const char *ballot_find_lsb_source =
"__kernel void test_sub_group_ballot_find_lsb(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" uint4 value = (uint4)(0,0,0,0);\n"
" value = (uint4)(sub_group_ballot_find_lsb(x),0,0,0);\n"
" out[gid] = value;\n"
"}\n";
static const char *ballot_find_msb_source =
"__kernel void test_sub_group_ballot_find_msb(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" uint4 value = (uint4)(0,0,0,0);"
" value = (uint4)(sub_group_ballot_find_msb(x),0,0,0);"
" out[gid] = value ;"
"}\n";
static const char *get_subgroup_ge_mask_source =
"__kernel void test_get_sub_group_ge_mask(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].z = get_max_sub_group_size();\n"
" Type x = in[gid];\n"
" uint4 mask = get_sub_group_ge_mask();"
" out[gid] = mask;\n"
"}\n";
static const char *get_subgroup_gt_mask_source =
"__kernel void test_get_sub_group_gt_mask(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].z = get_max_sub_group_size();\n"
" Type x = in[gid];\n"
" uint4 mask = get_sub_group_gt_mask();"
" out[gid] = mask;\n"
"}\n";
static const char *get_subgroup_le_mask_source =
"__kernel void test_get_sub_group_le_mask(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].z = get_max_sub_group_size();\n"
" Type x = in[gid];\n"
" uint4 mask = get_sub_group_le_mask();"
" out[gid] = mask;\n"
"}\n";
static const char *get_subgroup_lt_mask_source =
"__kernel void test_get_sub_group_lt_mask(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].z = get_max_sub_group_size();\n"
" Type x = in[gid];\n"
" uint4 mask = get_sub_group_lt_mask();"
" out[gid] = mask;\n"
"}\n";
static const char *get_subgroup_eq_mask_source =
"__kernel void test_get_sub_group_eq_mask(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].z = get_max_sub_group_size();\n"
" Type x = in[gid];\n"
" uint4 mask = get_sub_group_eq_mask();"
" out[gid] = mask;\n"
"}\n";
static const char *ballot_source =
"__kernel void test_sub_group_ballot(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
"uint4 full_ballot = sub_group_ballot(1);\n"
"uint divergence_mask;\n"
"uint4 partial_ballot;\n"
"uint gid = get_global_id(0);"
"XY(xy,gid);\n"
"if (get_sub_group_local_id() & 1) {\n"
" divergence_mask = 0xaaaaaaaa;\n"
" partial_ballot = sub_group_ballot(1);\n"
"} else {\n"
" divergence_mask = 0x55555555;\n"
" partial_ballot = sub_group_ballot(1);\n"
"}\n"
" size_t lws = get_local_size(0);\n"
"uint4 masked_ballot = full_ballot;\n"
"masked_ballot.x &= divergence_mask;\n"
"masked_ballot.y &= divergence_mask;\n"
"masked_ballot.z &= divergence_mask;\n"
"masked_ballot.w &= divergence_mask;\n"
"out[gid] = all(masked_ballot == partial_ballot);\n"
"} \n";
static const char *ballot_source_inverse =
"__kernel void test_sub_group_ballot_inverse(const __global "
"Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" uint4 value = (uint4)(10,0,0,0);\n"
" if (get_sub_group_local_id() & 1) {"
" uint4 partial_ballot_mask = "
"(uint4)(0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA);"
" if (sub_group_inverse_ballot(partial_ballot_mask)) {\n"
" value = (uint4)(1,0,0,1);\n"
" } else {\n"
" value = (uint4)(0,0,0,1);\n"
" }\n"
" } else {\n"
" uint4 partial_ballot_mask = "
"(uint4)(0x55555555,0x55555555,0x55555555,0x55555555);"
" if (sub_group_inverse_ballot(partial_ballot_mask)) {\n"
" value = (uint4)(1,0,0,2);\n"
" } else {\n"
" value = (uint4)(0,0,0,2);\n"
" }\n"
" }\n"
" out[gid] = value;\n"
"}\n";
static const char *ballot_bit_extract_source =
"__kernel void test_sub_group_ballot_bit_extract(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" uint index = xy[gid].z;\n"
" uint4 value = (uint4)(10,0,0,0);\n"
" if (get_sub_group_local_id() & 1) {"
" if (sub_group_ballot_bit_extract(x, xy[gid].z)) {\n"
" value = (uint4)(1,0,0,1);\n"
" } else {\n"
" value = (uint4)(0,0,0,1);\n"
" }\n"
" } else {\n"
" if (sub_group_ballot_bit_extract(x, xy[gid].w)) {\n"
" value = (uint4)(1,0,0,2);\n"
" } else {\n"
" value = (uint4)(0,0,0,2);\n"
" }\n"
" }\n"
" out[gid] = value;\n"
"}\n";
}
)";
std::string sub_group_inverse_ballot_source = R"(
__kernel void test_sub_group_inverse_ballot(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
Type x = in[gid];
uint4 value = (uint4)(10,0,0,0);
if (get_sub_group_local_id() & 1) {
uint4 partial_ballot_mask = (uint4)(0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA);
if (sub_group_inverse_ballot(partial_ballot_mask)) {
value = (uint4)(1,0,0,1);
} else {
value = (uint4)(0,0,0,1);
}
} else {
uint4 partial_ballot_mask = (uint4)(0x55555555,0x55555555,0x55555555,0x55555555);
if (sub_group_inverse_ballot(partial_ballot_mask)) {
value = (uint4)(1,0,0,2);
} else {
value = (uint4)(0,0,0,2);
}
}
out[gid] = value;
}
)";
std::string sub_group_ballot_bit_extract_source = R"(
__kernel void test_sub_group_ballot_bit_extract(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
Type x = in[gid];
uint index = xy[gid].z;
uint4 value = (uint4)(10,0,0,0);
if (get_sub_group_local_id() & 1) {
if (sub_group_ballot_bit_extract(x, xy[gid].z)) {
value = (uint4)(1,0,0,1);
} else {
value = (uint4)(0,0,0,1);
}
} else {
if (sub_group_ballot_bit_extract(x, xy[gid].w)) {
value = (uint4)(1,0,0,2);
} else {
value = (uint4)(0,0,0,2);
}
}
out[gid] = value;
}
)";
template <typename T> int run_non_uniform_broadcast_for_type(RunTestForType rft)
{
int error =
rft.run_impl<T, BC<T, SubgroupsBroadcastOp::non_uniform_broadcast>>(
"test_bcast_non_uniform", bcast_non_uniform_source);
"sub_group_non_uniform_broadcast");
return error;
}
@@ -932,9 +820,15 @@ int test_subgroup_functions_ballot(cl_device_id device, cl_context context,
"skipping test.\n");
return TEST_SKIPPED_ITSELF;
}
constexpr size_t global_work_size = 170;
constexpr size_t local_work_size = 64;
WorkGroupParams test_params(global_work_size, local_work_size);
test_params.save_kernel_source(sub_group_ballot_mask_source);
test_params.save_kernel_source(sub_group_non_uniform_broadcast_source,
"sub_group_non_uniform_broadcast");
test_params.save_kernel_source(sub_group_broadcast_first_source,
"sub_group_broadcast_first");
RunTestForType rft(device, context, queue, num_elements, test_params);
// non uniform broadcast functions
@@ -1018,76 +912,87 @@ int test_subgroup_functions_ballot(cl_device_id device, cl_context context,
// broadcast first functions
error |=
rft.run_impl<cl_int, BC<cl_int, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source);
"sub_group_broadcast_first");
error |= rft.run_impl<cl_uint,
BC<cl_uint, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source);
"sub_group_broadcast_first");
error |= rft.run_impl<cl_long,
BC<cl_long, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source);
"sub_group_broadcast_first");
error |= rft.run_impl<cl_ulong,
BC<cl_ulong, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source);
"sub_group_broadcast_first");
error |= rft.run_impl<cl_short,
BC<cl_short, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source);
"sub_group_broadcast_first");
error |= rft.run_impl<cl_ushort,
BC<cl_ushort, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source);
"sub_group_broadcast_first");
error |= rft.run_impl<cl_char,
BC<cl_char, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source);
"sub_group_broadcast_first");
error |= rft.run_impl<cl_uchar,
BC<cl_uchar, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source);
"sub_group_broadcast_first");
error |= rft.run_impl<cl_float,
BC<cl_float, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source);
"sub_group_broadcast_first");
error |= rft.run_impl<cl_double,
BC<cl_double, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source);
"sub_group_broadcast_first");
error |= rft.run_impl<
subgroups::cl_half,
BC<subgroups::cl_half, SubgroupsBroadcastOp::broadcast_first>>(
"test_bcast_first", bcast_first_source);
"sub_group_broadcast_first");
// mask functions
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::eq_mask>>(
"test_get_sub_group_eq_mask", get_subgroup_eq_mask_source);
"get_sub_group_eq_mask");
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::ge_mask>>(
"test_get_sub_group_ge_mask", get_subgroup_ge_mask_source);
"get_sub_group_ge_mask");
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::gt_mask>>(
"test_get_sub_group_gt_mask", get_subgroup_gt_mask_source);
"get_sub_group_gt_mask");
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::le_mask>>(
"test_get_sub_group_le_mask", get_subgroup_le_mask_source);
"get_sub_group_le_mask");
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::lt_mask>>(
"test_get_sub_group_lt_mask", get_subgroup_lt_mask_source);
"get_sub_group_lt_mask");
// ballot functions
error |= rft.run_impl<cl_uint, BALLOT<cl_uint>>("test_sub_group_ballot",
ballot_source);
error |= rft.run_impl<cl_uint4,
WorkGroupParams test_params_ballot(global_work_size, local_work_size);
test_params_ballot.save_kernel_source(
sub_group_ballot_bit_scan_find_source);
test_params_ballot.save_kernel_source(sub_group_ballot_source,
"sub_group_ballot");
test_params_ballot.save_kernel_source(sub_group_inverse_ballot_source,
"sub_group_inverse_ballot");
test_params_ballot.save_kernel_source(sub_group_ballot_bit_extract_source,
"sub_group_ballot_bit_extract");
RunTestForType rft_ballot(device, context, queue, num_elements,
test_params_ballot);
error |= rft_ballot.run_impl<cl_uint, BALLOT<cl_uint>>("sub_group_ballot");
error |=
rft_ballot.run_impl<cl_uint4,
BALLOT_INVERSE<cl_uint4, BallotOp::inverse_ballot>>(
"test_sub_group_ballot_inverse", ballot_source_inverse);
error |= rft.run_impl<
"sub_group_inverse_ballot");
error |= rft_ballot.run_impl<
cl_uint4, BALLOT_BIT_EXTRACT<cl_uint4, BallotOp::ballot_bit_extract>>(
"test_sub_group_ballot_bit_extract", ballot_bit_extract_source);
error |= rft.run_impl<
"sub_group_ballot_bit_extract");
error |= rft_ballot.run_impl<
cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_bit_count>>(
"test_sub_group_ballot_bit_count", ballot_bit_count_source);
error |= rft.run_impl<
"sub_group_ballot_bit_count");
error |= rft_ballot.run_impl<
cl_uint4,
BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_inclusive_scan>>(
"test_sub_group_ballot_inclusive_scan", ballot_inclusive_scan_source);
error |= rft.run_impl<
"sub_group_ballot_inclusive_scan");
error |= rft_ballot.run_impl<
cl_uint4,
BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_exclusive_scan>>(
"test_sub_group_ballot_exclusive_scan", ballot_exclusive_scan_source);
error |= rft.run_impl<
"sub_group_ballot_exclusive_scan");
error |= rft_ballot.run_impl<
cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_lsb>>(
"test_sub_group_ballot_find_lsb", ballot_find_lsb_source);
error |= rft.run_impl<
"sub_group_ballot_find_lsb");
error |= rft_ballot.run_impl<
cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_msb>>(
"test_sub_group_ballot_find_msb", ballot_find_msb_source);
"sub_group_ballot_find_msb");
return error;
}

View File

@@ -22,149 +22,17 @@
#define CLUSTER_SIZE_STR "4"
namespace {
static const char *redadd_clustered_source =
"__kernel void test_redadd_clustered(const __global Type *in, __global "
"int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].w = 0;\n"
" if (sizeof(in[gid]) == "
"sizeof(sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR ")))\n"
" {xy[gid].w = sizeof(in[gid]);}\n"
" out[gid] = sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR
");\n"
"}\n";
static const char *redmax_clustered_source =
"__kernel void test_redmax_clustered(const __global Type *in, __global "
"int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].w = 0;\n"
" if (sizeof(in[gid]) == "
"sizeof(sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR ")))\n"
" {xy[gid].w = sizeof(in[gid]);}\n"
" out[gid] = sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR
");\n"
"}\n";
static const char *redmin_clustered_source =
"__kernel void test_redmin_clustered(const __global Type *in, __global "
"int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].w = 0;\n"
" if (sizeof(in[gid]) == "
"sizeof(sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR ")))\n"
" {xy[gid].w = sizeof(in[gid]);}\n"
" out[gid] = sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR
");\n"
"}\n";
static const char *redmul_clustered_source =
"__kernel void test_redmul_clustered(const __global Type *in, __global "
"int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].w = 0;\n"
" if (sizeof(in[gid]) == "
"sizeof(sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR ")))\n"
" {xy[gid].w = sizeof(in[gid]);}\n"
" out[gid] = sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR
");\n"
"}\n";
static const char *redand_clustered_source =
"__kernel void test_redand_clustered(const __global Type *in, __global "
"int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].w = 0;\n"
" if (sizeof(in[gid]) == "
"sizeof(sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR ")))\n"
" {xy[gid].w = sizeof(in[gid]);}\n"
" out[gid] = sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR
");\n"
"}\n";
static const char *redor_clustered_source =
"__kernel void test_redor_clustered(const __global Type *in, __global int4 "
"*xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].w = 0;\n"
" if (sizeof(in[gid]) == "
"sizeof(sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR ")))\n"
" {xy[gid].w = sizeof(in[gid]);}\n"
" out[gid] = sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR
");\n"
"}\n";
static const char *redxor_clustered_source =
"__kernel void test_redxor_clustered(const __global Type *in, __global "
"int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].w = 0;\n"
" if (sizeof(in[gid]) == "
"sizeof(sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR ")))\n"
" {xy[gid].w = sizeof(in[gid]);}\n"
" out[gid] = sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR
");\n"
"}\n";
static const char *redand_clustered_logical_source =
"__kernel void test_redand_clustered_logical(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].w = 0;\n"
" if (sizeof(in[gid]) == "
"sizeof(sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR
")))\n"
" {xy[gid].w = sizeof(in[gid]);}\n"
" out[gid] = "
"sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR ");\n"
"}\n";
static const char *redor_clustered_logical_source =
"__kernel void test_redor_clustered_logical(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].w = 0;\n"
" if (sizeof(in[gid]) == "
"sizeof(sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR
")))\n"
" {xy[gid].w = sizeof(in[gid]);}\n"
" out[gid] = "
"sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR ");\n"
"}\n";
static const char *redxor_clustered_logical_source =
"__kernel void test_redxor_clustered_logical(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" xy[gid].w = 0;\n"
" if ( sizeof(in[gid]) == "
"sizeof(sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR
")))\n"
" {xy[gid].w = sizeof(in[gid]);}\n"
" out[gid] = "
"sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR ");\n"
"}\n";
std::string sub_group_clustered_reduce_source = R"(
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
xy[gid].w = 0;
if (sizeof(in[gid]) == sizeof(%s(in[gid], )" CLUSTER_SIZE_STR R"())) {
xy[gid].w = sizeof(in[gid]);
}
out[gid] = %s(in[gid], )" CLUSTER_SIZE_STR R"();
}
)";
// DESCRIPTION:
// Test for reduce cluster functions
@@ -267,34 +135,34 @@ template <typename T>
int run_cluster_red_add_max_min_mul_for_type(RunTestForType rft)
{
int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::add_>>(
"test_redadd_clustered", redadd_clustered_source);
"sub_group_clustered_reduce_add");
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::max_>>(
"test_redmax_clustered", redmax_clustered_source);
"sub_group_clustered_reduce_max");
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::min_>>(
"test_redmin_clustered", redmin_clustered_source);
"sub_group_clustered_reduce_min");
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::mul_>>(
"test_redmul_clustered", redmul_clustered_source);
"sub_group_clustered_reduce_mul");
return error;
}
template <typename T> int run_cluster_and_or_xor_for_type(RunTestForType rft)
{
int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::and_>>(
"test_redand_clustered", redand_clustered_source);
"sub_group_clustered_reduce_and");
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::or_>>(
"test_redor_clustered", redor_clustered_source);
"sub_group_clustered_reduce_or");
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::xor_>>(
"test_redxor_clustered", redxor_clustered_source);
"sub_group_clustered_reduce_xor");
return error;
}
template <typename T>
int run_cluster_logical_and_or_xor_for_type(RunTestForType rft)
{
int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_and>>(
"test_redand_clustered_logical", redand_clustered_logical_source);
"sub_group_clustered_reduce_logical_and");
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_or>>(
"test_redor_clustered_logical", redor_clustered_logical_source);
"sub_group_clustered_reduce_logical_or");
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_xor>>(
"test_redxor_clustered_logical", redxor_clustered_logical_source);
"sub_group_clustered_reduce_logical_xor");
return error;
}
@@ -311,9 +179,11 @@ int test_subgroup_functions_clustered_reduce(cl_device_id device,
"device, skipping test.\n");
return TEST_SKIPPED_ITSELF;
}
constexpr size_t global_work_size = 2000;
constexpr size_t local_work_size = 200;
WorkGroupParams test_params(global_work_size, local_work_size);
test_params.save_kernel_source(sub_group_clustered_reduce_source);
RunTestForType rft(device, context, queue, num_elements, test_params);
int error = run_cluster_red_add_max_min_mul_for_type<cl_int>(rft);

View File

@@ -24,30 +24,30 @@ namespace {
template <typename T> int run_broadcast_for_extended_type(RunTestForType rft)
{
int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
"test_bcast", bcast_source);
"sub_group_broadcast");
return error;
}
template <typename T> int run_scan_reduction_for_type(RunTestForType rft)
{
int error = rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("test_redadd",
redadd_source);
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("test_redmax",
redmax_source);
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("test_redmin",
redmin_source);
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>("test_scinadd",
scinadd_source);
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>("test_scinmax",
scinmax_source);
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>("test_scinmin",
scinmin_source);
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>("test_scexadd",
scexadd_source);
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>("test_scexmax",
scexmax_source);
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>("test_scexmin",
scexmin_source);
int error =
rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("sub_group_reduce_add");
error |=
rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("sub_group_reduce_max");
error |=
rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("sub_group_reduce_min");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
"sub_group_scan_inclusive_add");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
"sub_group_scan_inclusive_max");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
"sub_group_scan_inclusive_min");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
"sub_group_scan_exclusive_add");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
"sub_group_scan_exclusive_max");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
"sub_group_scan_exclusive_min");
return error;
}
@@ -65,11 +65,15 @@ int test_subgroup_functions_extended_types(cl_device_id device,
"device, skipping test.\n");
return TEST_SKIPPED_ITSELF;
}
constexpr size_t global_work_size = 2000;
constexpr size_t local_work_size = 200;
WorkGroupParams test_params(global_work_size, local_work_size);
RunTestForType rft(device, context, queue, num_elements, test_params);
test_params.save_kernel_source(sub_group_reduction_scan_source);
test_params.save_kernel_source(sub_group_generic_source,
"sub_group_broadcast");
RunTestForType rft(device, context, queue, num_elements, test_params);
int error = run_broadcast_for_extended_type<cl_uint2>(rft);
error |= run_broadcast_for_extended_type<subgroups::cl_uint3>(rft);
error |= run_broadcast_for_extended_type<cl_uint4>(rft);

View File

@@ -17,335 +17,28 @@
#include "subhelpers.h"
#include "harness/typeWrappers.h"
#include "subgroup_common_templates.h"
#include <cstdio>
namespace {
static const char *scinadd_non_uniform_source = R"(
__kernel void test_scinadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
std::string sub_group_non_uniform_arithmetic_source = R"(
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_add(in[gid]);
uint subgroup_local_id = get_sub_group_local_id();
uint elect_work_item = 1 << (subgroup_local_id % 32);
uint work_item_mask;
if(subgroup_local_id < 32) {
work_item_mask = work_item_mask_vector.x;
} else if(subgroup_local_id < 64) {
work_item_mask = work_item_mask_vector.y;
} else if(subgroup_local_id < 96) {
work_item_mask = work_item_mask_vector.w;
} else if(subgroup_local_id < 128) {
work_item_mask = work_item_mask_vector.z;
}
}
)";
static const char *scinmax_non_uniform_source = R"(
__kernel void test_scinmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_max(in[gid]);
}
}
)";
static const char *scinmin_non_uniform_source = R"(
__kernel void test_scinmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_min(in[gid]);
}
}
)";
static const char *scinmul_non_uniform_source = R"(
__kernel void test_scinmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_mul(in[gid]);
}
}
)";
static const char *scinand_non_uniform_source = R"(
__kernel void test_scinand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_and(in[gid]);
}
}
)";
static const char *scinor_non_uniform_source = R"(
__kernel void test_scinor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_or(in[gid]);
}
}
)";
static const char *scinxor_non_uniform_source = R"(
__kernel void test_scinxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_xor(in[gid]);
}
}
)";
static const char *scinand_non_uniform_logical_source = R"(
__kernel void test_scinand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_logical_and(in[gid]);
}
}
)";
static const char *scinor_non_uniform_logical_source = R"(
__kernel void test_scinor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_logical_or(in[gid]);
}
}
)";
static const char *scinxor_non_uniform_logical_source = R"(
__kernel void test_scinxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_logical_xor(in[gid]);
}
}
)";
static const char *scexadd_non_uniform_source = R"(
__kernel void test_scexadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_add(in[gid]);
}
}
)";
static const char *scexmax_non_uniform_source = R"(
__kernel void test_scexmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_max(in[gid]);
}
}
)";
static const char *scexmin_non_uniform_source = R"(
__kernel void test_scexmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_min(in[gid]);
}
}
)";
static const char *scexmul_non_uniform_source = R"(
__kernel void test_scexmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_mul(in[gid]);
}
}
)";
static const char *scexand_non_uniform_source = R"(
__kernel void test_scexand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_and(in[gid]);
}
}
)";
static const char *scexor_non_uniform_source = R"(
__kernel void test_scexor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_or(in[gid]);
}
}
)";
static const char *scexxor_non_uniform_source = R"(
__kernel void test_scexxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_xor(in[gid]);
}
}
)";
static const char *scexand_non_uniform_logical_source = R"(
__kernel void test_scexand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_logical_and(in[gid]);
}
}
)";
static const char *scexor_non_uniform_logical_source = R"(
__kernel void test_scexor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_logical_or(in[gid]);
}
}
)";
static const char *scexxor_non_uniform_logical_source = R"(
__kernel void test_scexxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_logical_xor(in[gid]);
}
}
)";
static const char *redadd_non_uniform_source = R"(
__kernel void test_redadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_add(in[gid]);
}
}
)";
static const char *redmax_non_uniform_source = R"(
__kernel void test_redmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_max(in[gid]);
}
}
)";
static const char *redmin_non_uniform_source = R"(
__kernel void test_redmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_min(in[gid]);
}
}
)";
static const char *redmul_non_uniform_source = R"(
__kernel void test_redmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_mul(in[gid]);
}
}
)";
static const char *redand_non_uniform_source = R"(
__kernel void test_redand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_and(in[gid]);
}
}
)";
static const char *redor_non_uniform_source = R"(
__kernel void test_redor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_or(in[gid]);
}
}
)";
static const char *redxor_non_uniform_source = R"(
__kernel void test_redxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_xor(in[gid]);
}
}
)";
static const char *redand_non_uniform_logical_source = R"(
__kernel void test_redand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_logical_and(in[gid]);
}
}
)";
static const char *redor_non_uniform_logical_source = R"(
__kernel void test_redor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_logical_or(in[gid]);
}
}
)";
static const char *redxor_non_uniform_logical_source = R"(
__kernel void test_redxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_logical_xor(in[gid]);
if (elect_work_item & work_item_mask){
out[gid] = %s(in[gid]);
}
}
)";
@@ -354,52 +47,52 @@ template <typename T>
int run_functions_add_mul_max_min_for_type(RunTestForType rft)
{
int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
"test_scinadd_non_uniform", scinadd_non_uniform_source);
"sub_group_non_uniform_scan_inclusive_add");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::mul_>>(
"test_scinmul_non_uniform", scinmul_non_uniform_source);
"sub_group_non_uniform_scan_inclusive_mul");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
"test_scinmax_non_uniform", scinmax_non_uniform_source);
"sub_group_non_uniform_scan_inclusive_max");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
"test_scinmin_non_uniform", scinmin_non_uniform_source);
"sub_group_non_uniform_scan_inclusive_min");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
"test_scexadd_non_uniform", scexadd_non_uniform_source);
"sub_group_non_uniform_scan_exclusive_add");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::mul_>>(
"test_scexmul_non_uniform", scexmul_non_uniform_source);
"sub_group_non_uniform_scan_exclusive_mul");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
"test_scexmax_non_uniform", scexmax_non_uniform_source);
"sub_group_non_uniform_scan_exclusive_max");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
"test_scexmin_non_uniform", scexmin_non_uniform_source);
"sub_group_non_uniform_scan_exclusive_min");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>(
"test_redadd_non_uniform", redadd_non_uniform_source);
"sub_group_non_uniform_reduce_add");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::mul_>>(
"test_redmul_non_uniform", redmul_non_uniform_source);
"sub_group_non_uniform_reduce_mul");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>(
"test_redmax_non_uniform", redmax_non_uniform_source);
"sub_group_non_uniform_reduce_max");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>(
"test_redmin_non_uniform", redmin_non_uniform_source);
"sub_group_non_uniform_reduce_min");
return error;
}
template <typename T> int run_functions_and_or_xor_for_type(RunTestForType rft)
{
int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::and_>>(
"test_scinand_non_uniform", scinand_non_uniform_source);
"sub_group_non_uniform_scan_inclusive_and");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::or_>>(
"test_scinor_non_uniform", scinor_non_uniform_source);
"sub_group_non_uniform_scan_inclusive_or");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::xor_>>(
"test_scinxor_non_uniform", scinxor_non_uniform_source);
"sub_group_non_uniform_scan_inclusive_xor");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::and_>>(
"test_scexand_non_uniform", scexand_non_uniform_source);
"sub_group_non_uniform_scan_exclusive_and");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::or_>>(
"test_scexor_non_uniform", scexor_non_uniform_source);
"sub_group_non_uniform_scan_exclusive_or");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::xor_>>(
"test_scexxor_non_uniform", scexxor_non_uniform_source);
"sub_group_non_uniform_scan_exclusive_xor");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::and_>>(
"test_redand_non_uniform", redand_non_uniform_source);
"sub_group_non_uniform_reduce_and");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::or_>>(
"test_redor_non_uniform", redor_non_uniform_source);
"sub_group_non_uniform_reduce_or");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::xor_>>(
"test_redxor_non_uniform", redxor_non_uniform_source);
"sub_group_non_uniform_reduce_xor");
return error;
}
@@ -407,23 +100,23 @@ template <typename T>
int run_functions_logical_and_or_xor_for_type(RunTestForType rft)
{
int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_and>>(
"test_scinand_non_uniform_logical", scinand_non_uniform_logical_source);
"sub_group_non_uniform_scan_inclusive_logical_and");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_or>>(
"test_scinor_non_uniform_logical", scinor_non_uniform_logical_source);
"sub_group_non_uniform_scan_inclusive_logical_or");
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_xor>>(
"test_scinxor_non_uniform_logical", scinxor_non_uniform_logical_source);
"sub_group_non_uniform_scan_inclusive_logical_xor");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_and>>(
"test_scexand_non_uniform_logical", scexand_non_uniform_logical_source);
"sub_group_non_uniform_scan_exclusive_logical_and");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_or>>(
"test_scexor_non_uniform_logical", scexor_non_uniform_logical_source);
"sub_group_non_uniform_scan_exclusive_logical_or");
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_xor>>(
"test_scexxor_non_uniform_logical", scexxor_non_uniform_logical_source);
"sub_group_non_uniform_scan_exclusive_logical_xor");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_and>>(
"test_redand_non_uniform_logical", redand_non_uniform_logical_source);
"sub_group_non_uniform_reduce_logical_and");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_or>>(
"test_redor_non_uniform_logical", redor_non_uniform_logical_source);
"sub_group_non_uniform_reduce_logical_or");
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_xor>>(
"test_redxor_non_uniform_logical", redxor_non_uniform_logical_source);
"sub_group_non_uniform_reduce_logical_xor");
return error;
}
@@ -441,13 +134,11 @@ int test_subgroup_functions_non_uniform_arithmetic(cl_device_id device,
"this device, skipping test.\n");
return TEST_SKIPPED_ITSELF;
}
std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
0x00ffff00, 0x80000000, 0xaaaaaaaa };
constexpr size_t global_work_size = 2000;
constexpr size_t local_work_size = 200;
WorkGroupParams test_params(global_work_size, local_work_size, masks);
WorkGroupParams test_params(global_work_size, local_work_size, true);
test_params.save_kernel_source(sub_group_non_uniform_arithmetic_source);
RunTestForType rft(device, context, queue, num_elements, test_params);
int error = run_functions_add_mul_max_min_for_type<cl_int>(rft);

View File

@@ -28,7 +28,6 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
int nj = (nw + ns - 1) / ns;
int non_uniform_size = ng % nw;
ng = ng / nw;
@@ -40,9 +39,11 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
operation_names(operation));
log_info(" test params: global size = %d local size = %d subgroups "
"size = %d work item mask = 0x%x data type (%s)\n",
test_params.global_workgroup_size, nw, ns, work_items_mask,
"size = %d data type (%s)\n",
test_params.global_workgroup_size, nw, ns,
TypeManager<T>::name());
log_info(" work items mask: %s\n",
test_params.work_items_mask.to_string().c_str());
if (non_uniform_size)
{
log_info(" non uniform work group size mode ON\n");
@@ -99,7 +100,6 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
int nj = (nw + ns - 1) / ns;
cl_int tr, rr;
int non_uniform_size = ng % nw;
@@ -141,8 +141,7 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
std::set<int> active_work_items;
for (i = 0; i < n; ++i)
{
uint32_t check_work_item = 1 << (i % 32);
if (work_items_mask & check_work_item)
if (test_params.work_items_mask.test(i))
{
active_work_items.insert(i);
switch (operation)
@@ -215,46 +214,47 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
return TEST_PASS;
}
};
static const char *elect_source = R"(
__kernel void test_elect(const __global Type *in, __global int4 *xy, __global Type *out) {
std::string sub_group_elect_source = R"(
__kernel void test_sub_group_elect(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
uint subgroup_local_id = get_sub_group_local_id();
uint elect_work_item = 1 << (subgroup_local_id % 32);
uint work_item_mask;
if(subgroup_local_id < 32) {
work_item_mask = work_item_mask_vector.x;
} else if(subgroup_local_id < 64) {
work_item_mask = work_item_mask_vector.y;
} else if(subgroup_local_id < 96) {
work_item_mask = work_item_mask_vector.z;
} else if(subgroup_local_id < 128) {
work_item_mask = work_item_mask_vector.w;
}
}
if (elect_work_item & work_item_mask){
out[gid] = sub_group_elect();
}
}
)";
static const char *non_uniform_any_source = R"(
__kernel void test_non_uniform_any(const __global Type *in, __global int4 *xy, __global Type *out) {
std::string sub_group_non_uniform_any_all_all_equal_source = R"(
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_any(in[gid]);
uint subgroup_local_id = get_sub_group_local_id();
uint elect_work_item = 1 << (subgroup_local_id % 32);
uint work_item_mask;
if(subgroup_local_id < 32) {
work_item_mask = work_item_mask_vector.x;
} else if(subgroup_local_id < 64) {
work_item_mask = work_item_mask_vector.y;
} else if(subgroup_local_id < 96) {
work_item_mask = work_item_mask_vector.z;
} else if(subgroup_local_id < 128) {
work_item_mask = work_item_mask_vector.w;
}
}
)";
static const char *non_uniform_all_source = R"(
__kernel void test_non_uniform_all(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_all(in[gid]);
}
}
)";
static const char *non_uniform_all_equal_source = R"(
__kernel void test_non_uniform_all_equal(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_all_equal(in[gid]);
if (elect_work_item & work_item_mask){
out[gid] = %s(in[gid]);
}
}
)";
@@ -262,7 +262,7 @@ static const char *non_uniform_all_equal_source = R"(
template <typename T> int run_vote_all_equal_for_type(RunTestForType rft)
{
int error = rft.run_impl<T, VOTE<T, NonUniformVoteOp::all_equal>>(
"test_non_uniform_all_equal", non_uniform_all_equal_source);
"sub_group_non_uniform_all_equal");
return error;
}
}
@@ -278,12 +278,13 @@ int test_subgroup_functions_non_uniform_vote(cl_device_id device,
"device, skipping test.\n");
return TEST_SKIPPED_ITSELF;
}
std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
0x00ffff00, 0x80000000 };
constexpr size_t global_work_size = 170;
constexpr size_t local_work_size = 64;
WorkGroupParams test_params(global_work_size, local_work_size, masks);
WorkGroupParams test_params(global_work_size, local_work_size, true);
test_params.save_kernel_source(
sub_group_non_uniform_any_all_all_equal_source);
test_params.save_kernel_source(sub_group_elect_source, "sub_group_elect");
RunTestForType rft(device, context, queue, num_elements, test_params);
int error = run_vote_all_equal_for_type<cl_int>(rft);
@@ -295,10 +296,10 @@ int test_subgroup_functions_non_uniform_vote(cl_device_id device,
error |= run_vote_all_equal_for_type<subgroups::cl_half>(rft);
error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::all>>(
"test_non_uniform_all", non_uniform_all_source);
"sub_group_non_uniform_all");
error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::elect>>(
"test_elect", elect_source);
"sub_group_elect");
error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::any>>(
"test_non_uniform_any", non_uniform_any_source);
"sub_group_non_uniform_any");
return error;
}

View File

@@ -15,38 +15,19 @@
//
#include "procs.h"
#include "subhelpers.h"
#include "subgroup_common_kernels.h"
#include "subgroup_common_templates.h"
#include "harness/typeWrappers.h"
#include <bitset>
namespace {
static const char* shuffle_xor_source =
"__kernel void test_sub_group_shuffle_xor(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" out[gid] = sub_group_shuffle_xor(x, xy[gid].z);"
"}\n";
static const char* shuffle_source =
"__kernel void test_sub_group_shuffle(const __global Type *in, __global "
"int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" out[gid] = sub_group_shuffle(x, xy[gid].z);"
"}\n";
template <typename T> int run_shuffle_for_type(RunTestForType rft)
{
int error = rft.run_impl<T, SHF<T, ShuffleOp::shuffle>>(
"test_sub_group_shuffle", shuffle_source);
int error =
rft.run_impl<T, SHF<T, ShuffleOp::shuffle>>("sub_group_shuffle");
error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_xor>>(
"test_sub_group_shuffle_xor", shuffle_xor_source);
"sub_group_shuffle_xor");
return error;
}
@@ -61,9 +42,11 @@ int test_subgroup_functions_shuffle(cl_device_id device, cl_context context,
"skipping test.\n");
return TEST_SKIPPED_ITSELF;
}
constexpr size_t global_work_size = 2000;
constexpr size_t local_work_size = 200;
WorkGroupParams test_params(global_work_size, local_work_size);
test_params.save_kernel_source(sub_group_generic_source);
RunTestForType rft(device, context, queue, num_elements, test_params);
int error = run_shuffle_for_type<cl_int>(rft);

View File

@@ -15,37 +15,19 @@
//
#include "procs.h"
#include "subhelpers.h"
#include "subgroup_common_kernels.h"
#include "subgroup_common_templates.h"
#include "harness/conversions.h"
#include "harness/typeWrappers.h"
namespace {
static const char* shuffle_down_source =
"__kernel void test_sub_group_shuffle_down(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" out[gid] = sub_group_shuffle_down(x, xy[gid].z);"
"}\n";
static const char* shuffle_up_source =
"__kernel void test_sub_group_shuffle_up(const __global Type *in, __global "
"int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" out[gid] = sub_group_shuffle_up(x, xy[gid].z);"
"}\n";
template <typename T> int run_shuffle_relative_for_type(RunTestForType rft)
{
int error = rft.run_impl<T, SHF<T, ShuffleOp::shuffle_up>>(
"test_sub_group_shuffle_up", shuffle_up_source);
int error =
rft.run_impl<T, SHF<T, ShuffleOp::shuffle_up>>("sub_group_shuffle_up");
error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_down>>(
"test_sub_group_shuffle_down", shuffle_down_source);
"sub_group_shuffle_down");
return error;
}
@@ -62,9 +44,11 @@ int test_subgroup_functions_shuffle_relative(cl_device_id device,
"device, skipping test.\n");
return TEST_SKIPPED_ITSELF;
}
constexpr size_t global_work_size = 2000;
constexpr size_t local_work_size = 200;
WorkGroupParams test_params(global_work_size, local_work_size);
test_params.save_kernel_source(sub_group_generic_source);
RunTestForType rft(device, context, queue, num_elements, test_params);
int error = run_shuffle_relative_for_type<cl_int>(rft);