mirror of
https://github.com/KhronosGroup/OpenCL-CTS.git
synced 2026-03-19 06:09:01 +00:00
Extended subgroups - use 128bit masks (#1215)
* Extended subgroups - use 128-bit masks * Refactor to avoid kernel code duplication * Unify kernel names as the test_ prefix + subgroup function name * Use string literals to improve readability * Use kernel templates to limit code duplication * WorkGroupParams allows defining a default kernel - a kernel template for multiple functions * WorkGroupParams allows defining a kernel for one specific subgroup function Co-authored-by: Stuart Brady <stuart.brady@arm.com>
This commit is contained in:
committed by
GitHub
parent
903f1bf65d
commit
92844bead1
@@ -15,92 +15,20 @@
|
|||||||
//
|
//
|
||||||
#include "subgroup_common_kernels.h"
|
#include "subgroup_common_kernels.h"
|
||||||
|
|
||||||
const char* bcast_source =
|
|
||||||
"__kernel void test_bcast(const __global Type *in, "
|
|
||||||
"__global int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" Type x = in[gid];\n"
|
|
||||||
" uint which_sub_group_local_id = xy[gid].z;\n"
|
|
||||||
" out[gid] = sub_group_broadcast(x, which_sub_group_local_id);\n"
|
|
||||||
|
|
||||||
"}\n";
|
std::string sub_group_reduction_scan_source = R"(
|
||||||
|
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
|
||||||
|
int gid = get_global_id(0);
|
||||||
|
XY(xy,gid);
|
||||||
|
out[gid] = %s(in[gid]);
|
||||||
|
}
|
||||||
|
)";
|
||||||
|
|
||||||
const char* redadd_source = "__kernel void test_redadd(const __global Type "
|
std::string sub_group_generic_source = R"(
|
||||||
"*in, __global int4 *xy, __global Type *out)\n"
|
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
|
||||||
"{\n"
|
int gid = get_global_id(0);
|
||||||
" int gid = get_global_id(0);\n"
|
XY(xy,gid);
|
||||||
" XY(xy,gid);\n"
|
Type x = in[gid];
|
||||||
" out[gid] = sub_group_reduce_add(in[gid]);\n"
|
out[gid] = %s(x, xy[gid].z);
|
||||||
"}\n";
|
}
|
||||||
|
)";
|
||||||
const char* redmax_source = "__kernel void test_redmax(const __global Type "
|
|
||||||
"*in, __global int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" out[gid] = sub_group_reduce_max(in[gid]);\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
const char* redmin_source = "__kernel void test_redmin(const __global Type "
|
|
||||||
"*in, __global int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" out[gid] = sub_group_reduce_min(in[gid]);\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
const char* scinadd_source =
|
|
||||||
"__kernel void test_scinadd(const __global Type *in, __global int4 *xy, "
|
|
||||||
"__global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" out[gid] = sub_group_scan_inclusive_add(in[gid]);\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
const char* scinmax_source =
|
|
||||||
"__kernel void test_scinmax(const __global Type *in, __global int4 *xy, "
|
|
||||||
"__global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" out[gid] = sub_group_scan_inclusive_max(in[gid]);\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
const char* scinmin_source =
|
|
||||||
"__kernel void test_scinmin(const __global Type *in, __global int4 *xy, "
|
|
||||||
"__global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" out[gid] = sub_group_scan_inclusive_min(in[gid]);\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
const char* scexadd_source =
|
|
||||||
"__kernel void test_scexadd(const __global Type *in, __global int4 *xy, "
|
|
||||||
"__global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" out[gid] = sub_group_scan_exclusive_add(in[gid]);\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
const char* scexmax_source =
|
|
||||||
"__kernel void test_scexmax(const __global Type *in, __global int4 *xy, "
|
|
||||||
"__global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" out[gid] = sub_group_scan_exclusive_max(in[gid]);\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
const char* scexmin_source =
|
|
||||||
"__kernel void test_scexmin(const __global Type *in, __global int4 *xy, "
|
|
||||||
"__global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" out[gid] = sub_group_scan_exclusive_min(in[gid]);\n"
|
|
||||||
"}\n";
|
|
||||||
@@ -18,15 +18,7 @@
|
|||||||
#include "subhelpers.h"
|
#include "subhelpers.h"
|
||||||
|
|
||||||
|
|
||||||
extern const char* bcast_source;
|
extern std::string sub_group_reduction_scan_source;
|
||||||
extern const char* redadd_source;
|
extern std::string sub_group_generic_source;
|
||||||
extern const char* redmax_source;
|
|
||||||
extern const char* redmin_source;
|
|
||||||
extern const char* scinadd_source;
|
|
||||||
extern const char* scinmax_source;
|
|
||||||
extern const char* scinmin_source;
|
|
||||||
extern const char* scexadd_source;
|
|
||||||
extern const char* scexmax_source;
|
|
||||||
extern const char* scexmin_source;
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -17,13 +17,10 @@
|
|||||||
#define SUBGROUPCOMMONTEMPLATES_H
|
#define SUBGROUPCOMMONTEMPLATES_H
|
||||||
|
|
||||||
#include "typeWrappers.h"
|
#include "typeWrappers.h"
|
||||||
#include <bitset>
|
|
||||||
#include "CL/cl_half.h"
|
#include "CL/cl_half.h"
|
||||||
#include "subhelpers.h"
|
#include "subhelpers.h"
|
||||||
|
|
||||||
#include <set>
|
#include <set>
|
||||||
|
|
||||||
typedef std::bitset<128> bs128;
|
|
||||||
static cl_uint4 generate_bit_mask(cl_uint subgroup_local_id,
|
static cl_uint4 generate_bit_mask(cl_uint subgroup_local_id,
|
||||||
const std::string &mask_type,
|
const std::string &mask_type,
|
||||||
cl_uint max_sub_group_size)
|
cl_uint max_sub_group_size)
|
||||||
@@ -577,16 +574,21 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
|
|||||||
int nw = test_params.local_workgroup_size;
|
int nw = test_params.local_workgroup_size;
|
||||||
int ns = test_params.subgroup_size;
|
int ns = test_params.subgroup_size;
|
||||||
int ng = test_params.global_workgroup_size;
|
int ng = test_params.global_workgroup_size;
|
||||||
uint32_t work_items_mask = test_params.work_items_mask;
|
|
||||||
ng = ng / nw;
|
ng = ng / nw;
|
||||||
std::string func_name;
|
std::string func_name;
|
||||||
work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive"
|
test_params.work_items_mask.any()
|
||||||
: func_name = "sub_group_scan_exclusive";
|
? func_name = "sub_group_non_uniform_scan_exclusive"
|
||||||
|
: func_name = "sub_group_scan_exclusive";
|
||||||
log_info(" %s_%s(%s)...\n", func_name.c_str(),
|
log_info(" %s_%s(%s)...\n", func_name.c_str(),
|
||||||
operation_names(operation), TypeManager<Ty>::name());
|
operation_names(operation), TypeManager<Ty>::name());
|
||||||
log_info(" test params: global size = %d local size = %d subgroups "
|
log_info(" test params: global size = %d local size = %d subgroups "
|
||||||
"size = %d work item mask = 0x%x \n",
|
"size = %d \n",
|
||||||
test_params.global_workgroup_size, nw, ns, work_items_mask);
|
test_params.global_workgroup_size, nw, ns);
|
||||||
|
if (test_params.work_items_mask.any())
|
||||||
|
{
|
||||||
|
log_info(" work items mask: %s\n",
|
||||||
|
test_params.work_items_mask.to_string().c_str());
|
||||||
|
}
|
||||||
genrand<Ty, operation>(x, t, m, ns, nw, ng);
|
genrand<Ty, operation>(x, t, m, ns, nw, ng);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -597,18 +599,22 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
|
|||||||
int nw = test_params.local_workgroup_size;
|
int nw = test_params.local_workgroup_size;
|
||||||
int ns = test_params.subgroup_size;
|
int ns = test_params.subgroup_size;
|
||||||
int ng = test_params.global_workgroup_size;
|
int ng = test_params.global_workgroup_size;
|
||||||
uint32_t work_items_mask = test_params.work_items_mask;
|
bs128 work_items_mask = test_params.work_items_mask;
|
||||||
int nj = (nw + ns - 1) / ns;
|
int nj = (nw + ns - 1) / ns;
|
||||||
Ty tr, rr;
|
Ty tr, rr;
|
||||||
ng = ng / nw;
|
ng = ng / nw;
|
||||||
|
|
||||||
std::string func_name;
|
std::string func_name;
|
||||||
work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive"
|
test_params.work_items_mask.any()
|
||||||
: func_name = "sub_group_scan_exclusive";
|
? func_name = "sub_group_non_uniform_scan_exclusive"
|
||||||
|
: func_name = "sub_group_scan_exclusive";
|
||||||
|
|
||||||
|
|
||||||
uint32_t use_work_items_mask;
|
|
||||||
// for uniform case take into consideration all workitems
|
// for uniform case take into consideration all workitems
|
||||||
use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask;
|
if (!work_items_mask.any())
|
||||||
|
{
|
||||||
|
work_items_mask.set();
|
||||||
|
}
|
||||||
for (k = 0; k < ng; ++k)
|
for (k = 0; k < ng; ++k)
|
||||||
{ // for each work_group
|
{ // for each work_group
|
||||||
// Map to array indexed to array indexed by local ID and sub group
|
// Map to array indexed to array indexed by local ID and sub group
|
||||||
@@ -624,8 +630,7 @@ template <typename Ty, ArithmeticOp operation> struct SCEX_NU
|
|||||||
std::set<int> active_work_items;
|
std::set<int> active_work_items;
|
||||||
for (i = 0; i < n; ++i)
|
for (i = 0; i < n; ++i)
|
||||||
{
|
{
|
||||||
uint32_t check_work_item = 1 << (i % 32);
|
if (work_items_mask.test(i))
|
||||||
if (use_work_items_mask & check_work_item)
|
|
||||||
{
|
{
|
||||||
active_work_items.insert(i);
|
active_work_items.insert(i);
|
||||||
}
|
}
|
||||||
@@ -688,18 +693,23 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU
|
|||||||
int nw = test_params.local_workgroup_size;
|
int nw = test_params.local_workgroup_size;
|
||||||
int ns = test_params.subgroup_size;
|
int ns = test_params.subgroup_size;
|
||||||
int ng = test_params.global_workgroup_size;
|
int ng = test_params.global_workgroup_size;
|
||||||
uint32_t work_items_mask = test_params.work_items_mask;
|
|
||||||
ng = ng / nw;
|
ng = ng / nw;
|
||||||
std::string func_name;
|
std::string func_name;
|
||||||
work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive"
|
test_params.work_items_mask.any()
|
||||||
: func_name = "sub_group_scan_inclusive";
|
? func_name = "sub_group_non_uniform_scan_inclusive"
|
||||||
|
: func_name = "sub_group_scan_inclusive";
|
||||||
|
|
||||||
genrand<Ty, operation>(x, t, m, ns, nw, ng);
|
genrand<Ty, operation>(x, t, m, ns, nw, ng);
|
||||||
log_info(" %s_%s(%s)...\n", func_name.c_str(),
|
log_info(" %s_%s(%s)...\n", func_name.c_str(),
|
||||||
operation_names(operation), TypeManager<Ty>::name());
|
operation_names(operation), TypeManager<Ty>::name());
|
||||||
log_info(" test params: global size = %d local size = %d subgroups "
|
log_info(" test params: global size = %d local size = %d subgroups "
|
||||||
"size = %d work item mask = 0x%x \n",
|
"size = %d \n",
|
||||||
test_params.global_workgroup_size, nw, ns, work_items_mask);
|
test_params.global_workgroup_size, nw, ns);
|
||||||
|
if (test_params.work_items_mask.any())
|
||||||
|
{
|
||||||
|
log_info(" work items mask: %s\n",
|
||||||
|
test_params.work_items_mask.to_string().c_str());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
|
static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
|
||||||
@@ -709,18 +719,22 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU
|
|||||||
int nw = test_params.local_workgroup_size;
|
int nw = test_params.local_workgroup_size;
|
||||||
int ns = test_params.subgroup_size;
|
int ns = test_params.subgroup_size;
|
||||||
int ng = test_params.global_workgroup_size;
|
int ng = test_params.global_workgroup_size;
|
||||||
uint32_t work_items_mask = test_params.work_items_mask;
|
bs128 work_items_mask = test_params.work_items_mask;
|
||||||
|
|
||||||
int nj = (nw + ns - 1) / ns;
|
int nj = (nw + ns - 1) / ns;
|
||||||
Ty tr, rr;
|
Ty tr, rr;
|
||||||
ng = ng / nw;
|
ng = ng / nw;
|
||||||
|
|
||||||
std::string func_name;
|
std::string func_name;
|
||||||
work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive"
|
work_items_mask.any()
|
||||||
: func_name = "sub_group_scan_inclusive";
|
? func_name = "sub_group_non_uniform_scan_inclusive"
|
||||||
|
: func_name = "sub_group_scan_inclusive";
|
||||||
|
|
||||||
uint32_t use_work_items_mask;
|
|
||||||
// for uniform case take into consideration all workitems
|
// for uniform case take into consideration all workitems
|
||||||
use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask;
|
if (!work_items_mask.any())
|
||||||
|
{
|
||||||
|
work_items_mask.set();
|
||||||
|
}
|
||||||
// std::bitset<32> mask32(use_work_items_mask);
|
// std::bitset<32> mask32(use_work_items_mask);
|
||||||
// for (int k) mask32.count();
|
// for (int k) mask32.count();
|
||||||
for (k = 0; k < ng; ++k)
|
for (k = 0; k < ng; ++k)
|
||||||
@@ -740,8 +754,7 @@ template <typename Ty, ArithmeticOp operation> struct SCIN_NU
|
|||||||
|
|
||||||
for (i = 0; i < n; ++i)
|
for (i = 0; i < n; ++i)
|
||||||
{
|
{
|
||||||
uint32_t check_work_item = 1 << (i % 32);
|
if (work_items_mask.test(i))
|
||||||
if (use_work_items_mask & check_work_item)
|
|
||||||
{
|
{
|
||||||
if (catch_frist_active == -1)
|
if (catch_frist_active == -1)
|
||||||
{
|
{
|
||||||
@@ -807,17 +820,22 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
|
|||||||
int nw = test_params.local_workgroup_size;
|
int nw = test_params.local_workgroup_size;
|
||||||
int ns = test_params.subgroup_size;
|
int ns = test_params.subgroup_size;
|
||||||
int ng = test_params.global_workgroup_size;
|
int ng = test_params.global_workgroup_size;
|
||||||
uint32_t work_items_mask = test_params.work_items_mask;
|
|
||||||
ng = ng / nw;
|
ng = ng / nw;
|
||||||
std::string func_name;
|
std::string func_name;
|
||||||
|
|
||||||
work_items_mask ? func_name = "sub_group_non_uniform_reduce"
|
test_params.work_items_mask.any()
|
||||||
: func_name = "sub_group_reduce";
|
? func_name = "sub_group_non_uniform_reduce"
|
||||||
|
: func_name = "sub_group_reduce";
|
||||||
log_info(" %s_%s(%s)...\n", func_name.c_str(),
|
log_info(" %s_%s(%s)...\n", func_name.c_str(),
|
||||||
operation_names(operation), TypeManager<Ty>::name());
|
operation_names(operation), TypeManager<Ty>::name());
|
||||||
log_info(" test params: global size = %d local size = %d subgroups "
|
log_info(" test params: global size = %d local size = %d subgroups "
|
||||||
"size = %d work item mask = 0x%x \n",
|
"size = %d \n",
|
||||||
test_params.global_workgroup_size, nw, ns, work_items_mask);
|
test_params.global_workgroup_size, nw, ns);
|
||||||
|
if (test_params.work_items_mask.any())
|
||||||
|
{
|
||||||
|
log_info(" work items mask: %s\n",
|
||||||
|
test_params.work_items_mask.to_string().c_str());
|
||||||
|
}
|
||||||
genrand<Ty, operation>(x, t, m, ns, nw, ng);
|
genrand<Ty, operation>(x, t, m, ns, nw, ng);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -828,14 +846,14 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
|
|||||||
int nw = test_params.local_workgroup_size;
|
int nw = test_params.local_workgroup_size;
|
||||||
int ns = test_params.subgroup_size;
|
int ns = test_params.subgroup_size;
|
||||||
int ng = test_params.global_workgroup_size;
|
int ng = test_params.global_workgroup_size;
|
||||||
uint32_t work_items_mask = test_params.work_items_mask;
|
bs128 work_items_mask = test_params.work_items_mask;
|
||||||
int nj = (nw + ns - 1) / ns;
|
int nj = (nw + ns - 1) / ns;
|
||||||
ng = ng / nw;
|
ng = ng / nw;
|
||||||
Ty tr, rr;
|
Ty tr, rr;
|
||||||
|
|
||||||
std::string func_name;
|
std::string func_name;
|
||||||
work_items_mask ? func_name = "sub_group_non_uniform_reduce"
|
work_items_mask.any() ? func_name = "sub_group_non_uniform_reduce"
|
||||||
: func_name = "sub_group_reduce";
|
: func_name = "sub_group_reduce";
|
||||||
|
|
||||||
for (k = 0; k < ng; ++k)
|
for (k = 0; k < ng; ++k)
|
||||||
{
|
{
|
||||||
@@ -847,9 +865,10 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
|
|||||||
my[j] = y[j];
|
my[j] = y[j];
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t use_work_items_mask;
|
if (!work_items_mask.any())
|
||||||
use_work_items_mask =
|
{
|
||||||
!work_items_mask ? 0xFFFFFFFF : work_items_mask;
|
work_items_mask.set();
|
||||||
|
}
|
||||||
|
|
||||||
for (j = 0; j < nj; ++j)
|
for (j = 0; j < nj; ++j)
|
||||||
{
|
{
|
||||||
@@ -859,8 +878,7 @@ template <typename Ty, ArithmeticOp operation> struct RED_NU
|
|||||||
int catch_frist_active = -1;
|
int catch_frist_active = -1;
|
||||||
for (i = 0; i < n; ++i)
|
for (i = 0; i < n; ++i)
|
||||||
{
|
{
|
||||||
uint32_t check_work_item = 1 << (i % 32);
|
if (work_items_mask.test(i))
|
||||||
if (use_work_items_mask & check_work_item)
|
|
||||||
{
|
{
|
||||||
if (catch_frist_active == -1)
|
if (catch_frist_active == -1)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -24,31 +24,172 @@
|
|||||||
#include <limits>
|
#include <limits>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <type_traits>
|
#include <type_traits>
|
||||||
|
#include <bitset>
|
||||||
|
#include <regex>
|
||||||
|
#include <map>
|
||||||
|
|
||||||
#define NR_OF_ACTIVE_WORK_ITEMS 4
|
#define NR_OF_ACTIVE_WORK_ITEMS 4
|
||||||
|
|
||||||
extern MTdata gMTdata;
|
extern MTdata gMTdata;
|
||||||
|
typedef std::bitset<128> bs128;
|
||||||
extern cl_half_rounding_mode g_rounding_mode;
|
extern cl_half_rounding_mode g_rounding_mode;
|
||||||
|
|
||||||
struct WorkGroupParams
|
struct WorkGroupParams
|
||||||
{
|
{
|
||||||
WorkGroupParams(size_t gws, size_t lws,
|
WorkGroupParams(size_t gws, size_t lws,
|
||||||
const std::vector<uint32_t> &all_wim = {})
|
bool use_mask = false)
|
||||||
: global_workgroup_size(gws), local_workgroup_size(lws),
|
: global_workgroup_size(gws), local_workgroup_size(lws),
|
||||||
all_work_item_masks(all_wim)
|
use_masks(use_mask)
|
||||||
{
|
{
|
||||||
subgroup_size = 0;
|
subgroup_size = 0;
|
||||||
work_items_mask = 0;
|
work_items_mask = 0;
|
||||||
use_core_subgroups = true;
|
use_core_subgroups = true;
|
||||||
dynsc = 0;
|
dynsc = 0;
|
||||||
|
load_masks();
|
||||||
}
|
}
|
||||||
size_t global_workgroup_size;
|
size_t global_workgroup_size;
|
||||||
size_t local_workgroup_size;
|
size_t local_workgroup_size;
|
||||||
size_t subgroup_size;
|
size_t subgroup_size;
|
||||||
uint32_t work_items_mask;
|
bs128 work_items_mask;
|
||||||
int dynsc;
|
int dynsc;
|
||||||
bool use_core_subgroups;
|
bool use_core_subgroups;
|
||||||
std::vector<uint32_t> all_work_item_masks;
|
std::vector<bs128> all_work_item_masks;
|
||||||
|
bool use_masks;
|
||||||
|
void save_kernel_source(const std::string &source, std::string name = "")
|
||||||
|
{
|
||||||
|
if (name == "")
|
||||||
|
{
|
||||||
|
name = "default";
|
||||||
|
}
|
||||||
|
if (kernel_function_name.find(name) != kernel_function_name.end())
|
||||||
|
{
|
||||||
|
log_info("Kernel definition duplication. Source will be "
|
||||||
|
"overwritten for function name %s",
|
||||||
|
name.c_str());
|
||||||
|
}
|
||||||
|
kernel_function_name[name] = source;
|
||||||
|
};
|
||||||
|
// return specific defined kernel or default.
|
||||||
|
std::string get_kernel_source(std::string name)
|
||||||
|
{
|
||||||
|
if (kernel_function_name.find(name) == kernel_function_name.end())
|
||||||
|
{
|
||||||
|
return kernel_function_name["default"];
|
||||||
|
}
|
||||||
|
return kernel_function_name[name];
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::map<std::string, std::string> kernel_function_name;
|
||||||
|
void load_masks()
|
||||||
|
{
|
||||||
|
if (use_masks)
|
||||||
|
{
|
||||||
|
// 1 in string will be set 1, 0 will be set 0
|
||||||
|
bs128 mask_0xf0f0f0f0("11110000111100001111000011110000"
|
||||||
|
"11110000111100001111000011110000"
|
||||||
|
"11110000111100001111000011110000"
|
||||||
|
"11110000111100001111000011110000",
|
||||||
|
128, '0', '1');
|
||||||
|
all_work_item_masks.push_back(mask_0xf0f0f0f0);
|
||||||
|
// 1 in string will be set 0, 0 will be set 1
|
||||||
|
bs128 mask_0x0f0f0f0f("11110000111100001111000011110000"
|
||||||
|
"11110000111100001111000011110000"
|
||||||
|
"11110000111100001111000011110000"
|
||||||
|
"11110000111100001111000011110000",
|
||||||
|
128, '1', '0');
|
||||||
|
all_work_item_masks.push_back(mask_0x0f0f0f0f);
|
||||||
|
bs128 mask_0x5555aaaa("10101010101010101010101010101010"
|
||||||
|
"10101010101010101010101010101010"
|
||||||
|
"10101010101010101010101010101010"
|
||||||
|
"10101010101010101010101010101010",
|
||||||
|
128, '0', '1');
|
||||||
|
all_work_item_masks.push_back(mask_0x5555aaaa);
|
||||||
|
bs128 mask_0xaaaa5555("10101010101010101010101010101010"
|
||||||
|
"10101010101010101010101010101010"
|
||||||
|
"10101010101010101010101010101010"
|
||||||
|
"10101010101010101010101010101010",
|
||||||
|
128, '1', '0');
|
||||||
|
all_work_item_masks.push_back(mask_0xaaaa5555);
|
||||||
|
// 0x0f0ff0f0
|
||||||
|
bs128 mask_0x0f0ff0f0("00001111000011111111000011110000"
|
||||||
|
"00001111000011111111000011110000"
|
||||||
|
"00001111000011111111000011110000"
|
||||||
|
"00001111000011111111000011110000",
|
||||||
|
128, '0', '1');
|
||||||
|
all_work_item_masks.push_back(mask_0x0f0ff0f0);
|
||||||
|
// 0xff0000ff
|
||||||
|
bs128 mask_0xff0000ff("11111111000000000000000011111111"
|
||||||
|
"11111111000000000000000011111111"
|
||||||
|
"11111111000000000000000011111111"
|
||||||
|
"11111111000000000000000011111111",
|
||||||
|
128, '0', '1');
|
||||||
|
all_work_item_masks.push_back(mask_0xff0000ff);
|
||||||
|
// 0xff00ff00
|
||||||
|
bs128 mask_0xff00ff00("11111111000000001111111100000000"
|
||||||
|
"11111111000000001111111100000000"
|
||||||
|
"11111111000000001111111100000000"
|
||||||
|
"11111111000000001111111100000000",
|
||||||
|
128, '0', '1');
|
||||||
|
all_work_item_masks.push_back(mask_0xff00ff00);
|
||||||
|
// 0x00ffff00
|
||||||
|
bs128 mask_0x00ffff00("00000000111111111111111100000000"
|
||||||
|
"00000000111111111111111100000000"
|
||||||
|
"00000000111111111111111100000000"
|
||||||
|
"00000000111111111111111100000000",
|
||||||
|
128, '0', '1');
|
||||||
|
all_work_item_masks.push_back(mask_0x00ffff00);
|
||||||
|
// 0x80 1 workitem highest id for 8 subgroup size
|
||||||
|
bs128 mask_0x80808080("10000000100000001000000010000000"
|
||||||
|
"10000000100000001000000010000000"
|
||||||
|
"10000000100000001000000010000000"
|
||||||
|
"10000000100000001000000010000000",
|
||||||
|
128, '0', '1');
|
||||||
|
|
||||||
|
all_work_item_masks.push_back(mask_0x80808080);
|
||||||
|
// 0x8000 1 workitem highest id for 16 subgroup size
|
||||||
|
bs128 mask_0x80008000("10000000000000001000000000000000"
|
||||||
|
"10000000000000001000000000000000"
|
||||||
|
"10000000000000001000000000000000"
|
||||||
|
"10000000000000001000000000000000",
|
||||||
|
128, '0', '1');
|
||||||
|
all_work_item_masks.push_back(mask_0x80008000);
|
||||||
|
// 0x80000000 1 workitem highest id for 32 subgroup size
|
||||||
|
bs128 mask_0x80000000("10000000000000000000000000000000"
|
||||||
|
"10000000000000000000000000000000"
|
||||||
|
"10000000000000000000000000000000"
|
||||||
|
"10000000000000000000000000000000",
|
||||||
|
128, '0', '1');
|
||||||
|
all_work_item_masks.push_back(mask_0x80000000);
|
||||||
|
// 0x80000000 00000000 1 workitem highest id for 64 subgroup size
|
||||||
|
// 0x80000000 1 workitem highest id for 32 subgroup size
|
||||||
|
bs128 mask_0x8000000000000000("10000000000000000000000000000000"
|
||||||
|
"00000000000000000000000000000000"
|
||||||
|
"10000000000000000000000000000000"
|
||||||
|
"00000000000000000000000000000000",
|
||||||
|
128, '0', '1');
|
||||||
|
|
||||||
|
all_work_item_masks.push_back(mask_0x8000000000000000);
|
||||||
|
// 0x80000000 00000000 00000000 00000000 1 workitem highest id for
|
||||||
|
// 128 subgroup size
|
||||||
|
bs128 mask_0x80000000000000000000000000000000(
|
||||||
|
"10000000000000000000000000000000"
|
||||||
|
"00000000000000000000000000000000"
|
||||||
|
"00000000000000000000000000000000"
|
||||||
|
"00000000000000000000000000000000",
|
||||||
|
128, '0', '1');
|
||||||
|
all_work_item_masks.push_back(
|
||||||
|
mask_0x80000000000000000000000000000000);
|
||||||
|
|
||||||
|
bs128 mask_0xffffffff("11111111111111111111111111111111"
|
||||||
|
"11111111111111111111111111111111"
|
||||||
|
"11111111111111111111111111111111"
|
||||||
|
"11111111111111111111111111111111",
|
||||||
|
128, '0', '1');
|
||||||
|
all_work_item_masks.push_back(mask_0xffffffff);
|
||||||
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
enum class SubgroupsBroadcastOp
|
enum class SubgroupsBroadcastOp
|
||||||
@@ -1267,11 +1408,23 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
|
|||||||
std::vector<Ty> mapout;
|
std::vector<Ty> mapout;
|
||||||
mapout.resize(local);
|
mapout.resize(local);
|
||||||
std::stringstream kernel_sstr;
|
std::stringstream kernel_sstr;
|
||||||
if (test_params.work_items_mask != 0)
|
if (test_params.use_masks)
|
||||||
{
|
{
|
||||||
kernel_sstr << "#define WORK_ITEMS_MASK ";
|
// Prepare uint4 type to store bitmask on kernel OpenCL C side
|
||||||
kernel_sstr << "0x" << std::hex << test_params.work_items_mask
|
// To keep order the first character in string is the lowest bit
|
||||||
<< "\n";
|
// there was a need to give such offset to bitset constructor
|
||||||
|
// (first highest offset = 96)
|
||||||
|
std::bitset<32> bits_1_32(test_params.work_items_mask.to_string(),
|
||||||
|
96, 32);
|
||||||
|
std::bitset<32> bits_33_64(test_params.work_items_mask.to_string(),
|
||||||
|
64, 32);
|
||||||
|
std::bitset<32> bits_65_96(test_params.work_items_mask.to_string(),
|
||||||
|
32, 32);
|
||||||
|
std::bitset<32> bits_97_128(test_params.work_items_mask.to_string(),
|
||||||
|
0, 32);
|
||||||
|
kernel_sstr << "global uint4 work_item_mask_vector = (uint4)(0b"
|
||||||
|
<< bits_1_32 << ",0b" << bits_33_64 << ",0b"
|
||||||
|
<< bits_65_96 << ",0b" << bits_97_128 << ");\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -1452,18 +1605,24 @@ struct RunTestForType
|
|||||||
num_elements_(num_elements), test_params_(test_params)
|
num_elements_(num_elements), test_params_(test_params)
|
||||||
{}
|
{}
|
||||||
template <typename T, typename U>
|
template <typename T, typename U>
|
||||||
int run_impl(const char *kernel_name, const char *source)
|
int run_impl(const std::string &function_name)
|
||||||
{
|
{
|
||||||
int error = TEST_PASS;
|
int error = TEST_PASS;
|
||||||
|
std::string source =
|
||||||
|
std::regex_replace(test_params_.get_kernel_source(function_name),
|
||||||
|
std::regex("\\%s"), function_name);
|
||||||
|
std::string kernel_name = "test_" + function_name;
|
||||||
if (test_params_.all_work_item_masks.size() > 0)
|
if (test_params_.all_work_item_masks.size() > 0)
|
||||||
{
|
{
|
||||||
error = test<T, U>::mrun(device_, context_, queue_, num_elements_,
|
error = test<T, U>::mrun(device_, context_, queue_, num_elements_,
|
||||||
kernel_name, source, test_params_);
|
kernel_name.c_str(), source.c_str(),
|
||||||
|
test_params_);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
error = test<T, U>::run(device_, context_, queue_, num_elements_,
|
error = test<T, U>::run(device_, context_, queue_, num_elements_,
|
||||||
kernel_name, source, test_params_);
|
kernel_name.c_str(), source.c_str(),
|
||||||
|
test_params_);
|
||||||
}
|
}
|
||||||
|
|
||||||
return error;
|
return error;
|
||||||
|
|||||||
@@ -150,25 +150,25 @@ template <typename T>
|
|||||||
int run_broadcast_scan_reduction_for_type(RunTestForType rft)
|
int run_broadcast_scan_reduction_for_type(RunTestForType rft)
|
||||||
{
|
{
|
||||||
int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
|
int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
|
||||||
"test_bcast", bcast_source);
|
"sub_group_broadcast");
|
||||||
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("test_redadd",
|
error |=
|
||||||
redadd_source);
|
rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("sub_group_reduce_add");
|
||||||
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("test_redmax",
|
error |=
|
||||||
redmax_source);
|
rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("sub_group_reduce_max");
|
||||||
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("test_redmin",
|
error |=
|
||||||
redmin_source);
|
rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("sub_group_reduce_min");
|
||||||
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>("test_scinadd",
|
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
|
||||||
scinadd_source);
|
"sub_group_scan_inclusive_add");
|
||||||
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>("test_scinmax",
|
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
|
||||||
scinmax_source);
|
"sub_group_scan_inclusive_max");
|
||||||
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>("test_scinmin",
|
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
|
||||||
scinmin_source);
|
"sub_group_scan_inclusive_min");
|
||||||
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>("test_scexadd",
|
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
|
||||||
scexadd_source);
|
"sub_group_scan_exclusive_add");
|
||||||
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>("test_scexmax",
|
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
|
||||||
scexmax_source);
|
"sub_group_scan_exclusive_max");
|
||||||
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>("test_scexmin",
|
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
|
||||||
scexmin_source);
|
"sub_group_scan_exclusive_min");
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -181,11 +181,14 @@ int test_subgroup_functions(cl_device_id device, cl_context context,
|
|||||||
constexpr size_t global_work_size = 2000;
|
constexpr size_t global_work_size = 2000;
|
||||||
constexpr size_t local_work_size = 200;
|
constexpr size_t local_work_size = 200;
|
||||||
WorkGroupParams test_params(global_work_size, local_work_size);
|
WorkGroupParams test_params(global_work_size, local_work_size);
|
||||||
|
test_params.save_kernel_source(sub_group_reduction_scan_source);
|
||||||
|
test_params.save_kernel_source(sub_group_generic_source,
|
||||||
|
"sub_group_broadcast");
|
||||||
|
|
||||||
RunTestForType rft(device, context, queue, num_elements, test_params);
|
RunTestForType rft(device, context, queue, num_elements, test_params);
|
||||||
int error =
|
int error =
|
||||||
rft.run_impl<cl_int, AA<NonUniformVoteOp::any>>("test_any", any_source);
|
rft.run_impl<cl_int, AA<NonUniformVoteOp::any>>("sub_group_any");
|
||||||
error |=
|
error |= rft.run_impl<cl_int, AA<NonUniformVoteOp::all>>("sub_group_all");
|
||||||
rft.run_impl<cl_int, AA<NonUniformVoteOp::all>>("test_all", all_source);
|
|
||||||
error |= run_broadcast_scan_reduction_for_type<cl_int>(rft);
|
error |= run_broadcast_scan_reduction_for_type<cl_int>(rft);
|
||||||
error |= run_broadcast_scan_reduction_for_type<cl_uint>(rft);
|
error |= run_broadcast_scan_reduction_for_type<cl_uint>(rft);
|
||||||
error |= run_broadcast_scan_reduction_for_type<cl_long>(rft);
|
error |= run_broadcast_scan_reduction_for_type<cl_long>(rft);
|
||||||
|
|||||||
@@ -684,239 +684,127 @@ template <typename Ty, BallotOp operation> struct SMASK
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static const char *bcast_non_uniform_source =
|
std::string sub_group_non_uniform_broadcast_source = R"(
|
||||||
"__kernel void test_bcast_non_uniform(const __global Type *in, __global "
|
__kernel void test_sub_group_non_uniform_broadcast(const __global Type *in, __global int4 *xy, __global Type *out) {
|
||||||
"int4 *xy, __global Type *out)\n"
|
int gid = get_global_id(0);
|
||||||
"{\n"
|
XY(xy,gid);
|
||||||
" int gid = get_global_id(0);\n"
|
Type x = in[gid];
|
||||||
" XY(xy,gid);\n"
|
if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {
|
||||||
" Type x = in[gid];\n"
|
out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].z);
|
||||||
" if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n"
|
} else {
|
||||||
" out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].z);\n"
|
out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].w);
|
||||||
" } else {\n"
|
}
|
||||||
" out[gid] = sub_group_non_uniform_broadcast(x, xy[gid].w);\n"
|
}
|
||||||
" }\n"
|
)";
|
||||||
"}\n";
|
std::string sub_group_broadcast_first_source = R"(
|
||||||
|
__kernel void test_sub_group_broadcast_first(const __global Type *in, __global int4 *xy, __global Type *out) {
|
||||||
|
int gid = get_global_id(0);
|
||||||
|
XY(xy,gid);
|
||||||
|
Type x = in[gid];
|
||||||
|
if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {
|
||||||
|
out[gid] = sub_group_broadcast_first(x);;
|
||||||
|
} else {
|
||||||
|
out[gid] = sub_group_broadcast_first(x);;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)";
|
||||||
|
std::string sub_group_ballot_bit_scan_find_source = R"(
|
||||||
|
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
|
||||||
|
int gid = get_global_id(0);
|
||||||
|
XY(xy,gid);
|
||||||
|
Type x = in[gid];
|
||||||
|
uint4 value = (uint4)(0,0,0,0);
|
||||||
|
value = (uint4)(%s(x),0,0,0);
|
||||||
|
out[gid] = value;
|
||||||
|
}
|
||||||
|
)";
|
||||||
|
std::string sub_group_ballot_mask_source = R"(
|
||||||
|
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
|
||||||
|
int gid = get_global_id(0);
|
||||||
|
XY(xy,gid);
|
||||||
|
xy[gid].z = get_max_sub_group_size();
|
||||||
|
Type x = in[gid];
|
||||||
|
uint4 mask = %s();
|
||||||
|
out[gid] = mask;
|
||||||
|
}
|
||||||
|
)";
|
||||||
|
std::string sub_group_ballot_source = R"(
|
||||||
|
__kernel void test_sub_group_ballot(const __global Type *in, __global int4 *xy, __global Type *out) {
|
||||||
|
uint4 full_ballot = sub_group_ballot(1);
|
||||||
|
uint divergence_mask;
|
||||||
|
uint4 partial_ballot;
|
||||||
|
uint gid = get_global_id(0);
|
||||||
|
XY(xy,gid);
|
||||||
|
if (get_sub_group_local_id() & 1) {
|
||||||
|
divergence_mask = 0xaaaaaaaa;
|
||||||
|
partial_ballot = sub_group_ballot(1);
|
||||||
|
} else {
|
||||||
|
divergence_mask = 0x55555555;
|
||||||
|
partial_ballot = sub_group_ballot(1);
|
||||||
|
}
|
||||||
|
size_t lws = get_local_size(0);
|
||||||
|
uint4 masked_ballot = full_ballot;
|
||||||
|
masked_ballot.x &= divergence_mask;
|
||||||
|
masked_ballot.y &= divergence_mask;
|
||||||
|
masked_ballot.z &= divergence_mask;
|
||||||
|
masked_ballot.w &= divergence_mask;
|
||||||
|
out[gid] = all(masked_ballot == partial_ballot);
|
||||||
|
|
||||||
static const char *bcast_first_source =
|
}
|
||||||
"__kernel void test_bcast_first(const __global Type *in, __global int4 "
|
)";
|
||||||
"*xy, __global Type *out)\n"
|
std::string sub_group_inverse_ballot_source = R"(
|
||||||
"{\n"
|
__kernel void test_sub_group_inverse_ballot(const __global Type *in, __global int4 *xy, __global Type *out) {
|
||||||
" int gid = get_global_id(0);\n"
|
int gid = get_global_id(0);
|
||||||
" XY(xy,gid);\n"
|
XY(xy,gid);
|
||||||
" Type x = in[gid];\n"
|
Type x = in[gid];
|
||||||
" if (xy[gid].x < NR_OF_ACTIVE_WORK_ITEMS) {\n"
|
uint4 value = (uint4)(10,0,0,0);
|
||||||
" out[gid] = sub_group_broadcast_first(x);\n"
|
if (get_sub_group_local_id() & 1) {
|
||||||
" } else {\n"
|
uint4 partial_ballot_mask = (uint4)(0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA);
|
||||||
" out[gid] = sub_group_broadcast_first(x);\n"
|
if (sub_group_inverse_ballot(partial_ballot_mask)) {
|
||||||
" }\n"
|
value = (uint4)(1,0,0,1);
|
||||||
"}\n";
|
} else {
|
||||||
|
value = (uint4)(0,0,0,1);
|
||||||
static const char *ballot_bit_count_source =
|
}
|
||||||
"__kernel void test_sub_group_ballot_bit_count(const __global Type *in, "
|
} else {
|
||||||
"__global int4 *xy, __global Type *out)\n"
|
uint4 partial_ballot_mask = (uint4)(0x55555555,0x55555555,0x55555555,0x55555555);
|
||||||
"{\n"
|
if (sub_group_inverse_ballot(partial_ballot_mask)) {
|
||||||
" int gid = get_global_id(0);\n"
|
value = (uint4)(1,0,0,2);
|
||||||
" XY(xy,gid);\n"
|
} else {
|
||||||
" Type x = in[gid];\n"
|
value = (uint4)(0,0,0,2);
|
||||||
" uint4 value = (uint4)(0,0,0,0);\n"
|
}
|
||||||
" value = (uint4)(sub_group_ballot_bit_count(x),0,0,0);\n"
|
}
|
||||||
" out[gid] = value;\n"
|
out[gid] = value;
|
||||||
"}\n";
|
}
|
||||||
|
)";
|
||||||
static const char *ballot_inclusive_scan_source =
|
std::string sub_group_ballot_bit_extract_source = R"(
|
||||||
"__kernel void test_sub_group_ballot_inclusive_scan(const __global Type "
|
__kernel void test_sub_group_ballot_bit_extract(const __global Type *in, __global int4 *xy, __global Type *out) {
|
||||||
"*in, __global int4 *xy, __global Type *out)\n"
|
int gid = get_global_id(0);
|
||||||
"{\n"
|
XY(xy,gid);
|
||||||
" int gid = get_global_id(0);\n"
|
Type x = in[gid];
|
||||||
" XY(xy,gid);\n"
|
uint index = xy[gid].z;
|
||||||
" Type x = in[gid];\n"
|
uint4 value = (uint4)(10,0,0,0);
|
||||||
" uint4 value = (uint4)(0,0,0,0);\n"
|
if (get_sub_group_local_id() & 1) {
|
||||||
" value = (uint4)(sub_group_ballot_inclusive_scan(x),0,0,0);\n"
|
if (sub_group_ballot_bit_extract(x, xy[gid].z)) {
|
||||||
" out[gid] = value;\n"
|
value = (uint4)(1,0,0,1);
|
||||||
"}\n";
|
} else {
|
||||||
|
value = (uint4)(0,0,0,1);
|
||||||
static const char *ballot_exclusive_scan_source =
|
}
|
||||||
"__kernel void test_sub_group_ballot_exclusive_scan(const __global Type "
|
} else {
|
||||||
"*in, __global int4 *xy, __global Type *out)\n"
|
if (sub_group_ballot_bit_extract(x, xy[gid].w)) {
|
||||||
"{\n"
|
value = (uint4)(1,0,0,2);
|
||||||
" int gid = get_global_id(0);\n"
|
} else {
|
||||||
" XY(xy,gid);\n"
|
value = (uint4)(0,0,0,2);
|
||||||
" Type x = in[gid];\n"
|
}
|
||||||
" uint4 value = (uint4)(0,0,0,0);\n"
|
}
|
||||||
" value = (uint4)(sub_group_ballot_exclusive_scan(x),0,0,0);\n"
|
out[gid] = value;
|
||||||
" out[gid] = value;\n"
|
}
|
||||||
"}\n";
|
)";
|
||||||
|
|
||||||
static const char *ballot_find_lsb_source =
|
|
||||||
"__kernel void test_sub_group_ballot_find_lsb(const __global Type *in, "
|
|
||||||
"__global int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" Type x = in[gid];\n"
|
|
||||||
" uint4 value = (uint4)(0,0,0,0);\n"
|
|
||||||
" value = (uint4)(sub_group_ballot_find_lsb(x),0,0,0);\n"
|
|
||||||
" out[gid] = value;\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
static const char *ballot_find_msb_source =
|
|
||||||
"__kernel void test_sub_group_ballot_find_msb(const __global Type *in, "
|
|
||||||
"__global int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" Type x = in[gid];\n"
|
|
||||||
" uint4 value = (uint4)(0,0,0,0);"
|
|
||||||
" value = (uint4)(sub_group_ballot_find_msb(x),0,0,0);"
|
|
||||||
" out[gid] = value ;"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
static const char *get_subgroup_ge_mask_source =
|
|
||||||
"__kernel void test_get_sub_group_ge_mask(const __global Type *in, "
|
|
||||||
"__global int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" xy[gid].z = get_max_sub_group_size();\n"
|
|
||||||
" Type x = in[gid];\n"
|
|
||||||
" uint4 mask = get_sub_group_ge_mask();"
|
|
||||||
" out[gid] = mask;\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
static const char *get_subgroup_gt_mask_source =
|
|
||||||
"__kernel void test_get_sub_group_gt_mask(const __global Type *in, "
|
|
||||||
"__global int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" xy[gid].z = get_max_sub_group_size();\n"
|
|
||||||
" Type x = in[gid];\n"
|
|
||||||
" uint4 mask = get_sub_group_gt_mask();"
|
|
||||||
" out[gid] = mask;\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
static const char *get_subgroup_le_mask_source =
|
|
||||||
"__kernel void test_get_sub_group_le_mask(const __global Type *in, "
|
|
||||||
"__global int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" xy[gid].z = get_max_sub_group_size();\n"
|
|
||||||
" Type x = in[gid];\n"
|
|
||||||
" uint4 mask = get_sub_group_le_mask();"
|
|
||||||
" out[gid] = mask;\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
static const char *get_subgroup_lt_mask_source =
|
|
||||||
"__kernel void test_get_sub_group_lt_mask(const __global Type *in, "
|
|
||||||
"__global int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" xy[gid].z = get_max_sub_group_size();\n"
|
|
||||||
" Type x = in[gid];\n"
|
|
||||||
" uint4 mask = get_sub_group_lt_mask();"
|
|
||||||
" out[gid] = mask;\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
static const char *get_subgroup_eq_mask_source =
|
|
||||||
"__kernel void test_get_sub_group_eq_mask(const __global Type *in, "
|
|
||||||
"__global int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" xy[gid].z = get_max_sub_group_size();\n"
|
|
||||||
" Type x = in[gid];\n"
|
|
||||||
" uint4 mask = get_sub_group_eq_mask();"
|
|
||||||
" out[gid] = mask;\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
static const char *ballot_source =
|
|
||||||
"__kernel void test_sub_group_ballot(const __global Type *in, "
|
|
||||||
"__global int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
"uint4 full_ballot = sub_group_ballot(1);\n"
|
|
||||||
"uint divergence_mask;\n"
|
|
||||||
"uint4 partial_ballot;\n"
|
|
||||||
"uint gid = get_global_id(0);"
|
|
||||||
"XY(xy,gid);\n"
|
|
||||||
"if (get_sub_group_local_id() & 1) {\n"
|
|
||||||
" divergence_mask = 0xaaaaaaaa;\n"
|
|
||||||
" partial_ballot = sub_group_ballot(1);\n"
|
|
||||||
"} else {\n"
|
|
||||||
" divergence_mask = 0x55555555;\n"
|
|
||||||
" partial_ballot = sub_group_ballot(1);\n"
|
|
||||||
"}\n"
|
|
||||||
" size_t lws = get_local_size(0);\n"
|
|
||||||
"uint4 masked_ballot = full_ballot;\n"
|
|
||||||
"masked_ballot.x &= divergence_mask;\n"
|
|
||||||
"masked_ballot.y &= divergence_mask;\n"
|
|
||||||
"masked_ballot.z &= divergence_mask;\n"
|
|
||||||
"masked_ballot.w &= divergence_mask;\n"
|
|
||||||
"out[gid] = all(masked_ballot == partial_ballot);\n"
|
|
||||||
|
|
||||||
"} \n";
|
|
||||||
|
|
||||||
static const char *ballot_source_inverse =
|
|
||||||
"__kernel void test_sub_group_ballot_inverse(const __global "
|
|
||||||
"Type *in, "
|
|
||||||
"__global int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" Type x = in[gid];\n"
|
|
||||||
" uint4 value = (uint4)(10,0,0,0);\n"
|
|
||||||
" if (get_sub_group_local_id() & 1) {"
|
|
||||||
" uint4 partial_ballot_mask = "
|
|
||||||
"(uint4)(0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA,0xAAAAAAAA);"
|
|
||||||
" if (sub_group_inverse_ballot(partial_ballot_mask)) {\n"
|
|
||||||
" value = (uint4)(1,0,0,1);\n"
|
|
||||||
" } else {\n"
|
|
||||||
" value = (uint4)(0,0,0,1);\n"
|
|
||||||
" }\n"
|
|
||||||
" } else {\n"
|
|
||||||
" uint4 partial_ballot_mask = "
|
|
||||||
"(uint4)(0x55555555,0x55555555,0x55555555,0x55555555);"
|
|
||||||
" if (sub_group_inverse_ballot(partial_ballot_mask)) {\n"
|
|
||||||
" value = (uint4)(1,0,0,2);\n"
|
|
||||||
" } else {\n"
|
|
||||||
" value = (uint4)(0,0,0,2);\n"
|
|
||||||
" }\n"
|
|
||||||
" }\n"
|
|
||||||
" out[gid] = value;\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
static const char *ballot_bit_extract_source =
|
|
||||||
"__kernel void test_sub_group_ballot_bit_extract(const __global Type *in, "
|
|
||||||
"__global int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" Type x = in[gid];\n"
|
|
||||||
" uint index = xy[gid].z;\n"
|
|
||||||
" uint4 value = (uint4)(10,0,0,0);\n"
|
|
||||||
" if (get_sub_group_local_id() & 1) {"
|
|
||||||
" if (sub_group_ballot_bit_extract(x, xy[gid].z)) {\n"
|
|
||||||
" value = (uint4)(1,0,0,1);\n"
|
|
||||||
" } else {\n"
|
|
||||||
" value = (uint4)(0,0,0,1);\n"
|
|
||||||
" }\n"
|
|
||||||
" } else {\n"
|
|
||||||
" if (sub_group_ballot_bit_extract(x, xy[gid].w)) {\n"
|
|
||||||
" value = (uint4)(1,0,0,2);\n"
|
|
||||||
" } else {\n"
|
|
||||||
" value = (uint4)(0,0,0,2);\n"
|
|
||||||
" }\n"
|
|
||||||
" }\n"
|
|
||||||
" out[gid] = value;\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
template <typename T> int run_non_uniform_broadcast_for_type(RunTestForType rft)
|
template <typename T> int run_non_uniform_broadcast_for_type(RunTestForType rft)
|
||||||
{
|
{
|
||||||
int error =
|
int error =
|
||||||
rft.run_impl<T, BC<T, SubgroupsBroadcastOp::non_uniform_broadcast>>(
|
rft.run_impl<T, BC<T, SubgroupsBroadcastOp::non_uniform_broadcast>>(
|
||||||
"test_bcast_non_uniform", bcast_non_uniform_source);
|
"sub_group_non_uniform_broadcast");
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -932,9 +820,15 @@ int test_subgroup_functions_ballot(cl_device_id device, cl_context context,
|
|||||||
"skipping test.\n");
|
"skipping test.\n");
|
||||||
return TEST_SKIPPED_ITSELF;
|
return TEST_SKIPPED_ITSELF;
|
||||||
}
|
}
|
||||||
|
|
||||||
constexpr size_t global_work_size = 170;
|
constexpr size_t global_work_size = 170;
|
||||||
constexpr size_t local_work_size = 64;
|
constexpr size_t local_work_size = 64;
|
||||||
WorkGroupParams test_params(global_work_size, local_work_size);
|
WorkGroupParams test_params(global_work_size, local_work_size);
|
||||||
|
test_params.save_kernel_source(sub_group_ballot_mask_source);
|
||||||
|
test_params.save_kernel_source(sub_group_non_uniform_broadcast_source,
|
||||||
|
"sub_group_non_uniform_broadcast");
|
||||||
|
test_params.save_kernel_source(sub_group_broadcast_first_source,
|
||||||
|
"sub_group_broadcast_first");
|
||||||
RunTestForType rft(device, context, queue, num_elements, test_params);
|
RunTestForType rft(device, context, queue, num_elements, test_params);
|
||||||
|
|
||||||
// non uniform broadcast functions
|
// non uniform broadcast functions
|
||||||
@@ -1018,76 +912,87 @@ int test_subgroup_functions_ballot(cl_device_id device, cl_context context,
|
|||||||
// broadcast first functions
|
// broadcast first functions
|
||||||
error |=
|
error |=
|
||||||
rft.run_impl<cl_int, BC<cl_int, SubgroupsBroadcastOp::broadcast_first>>(
|
rft.run_impl<cl_int, BC<cl_int, SubgroupsBroadcastOp::broadcast_first>>(
|
||||||
"test_bcast_first", bcast_first_source);
|
"sub_group_broadcast_first");
|
||||||
error |= rft.run_impl<cl_uint,
|
error |= rft.run_impl<cl_uint,
|
||||||
BC<cl_uint, SubgroupsBroadcastOp::broadcast_first>>(
|
BC<cl_uint, SubgroupsBroadcastOp::broadcast_first>>(
|
||||||
"test_bcast_first", bcast_first_source);
|
"sub_group_broadcast_first");
|
||||||
error |= rft.run_impl<cl_long,
|
error |= rft.run_impl<cl_long,
|
||||||
BC<cl_long, SubgroupsBroadcastOp::broadcast_first>>(
|
BC<cl_long, SubgroupsBroadcastOp::broadcast_first>>(
|
||||||
"test_bcast_first", bcast_first_source);
|
"sub_group_broadcast_first");
|
||||||
error |= rft.run_impl<cl_ulong,
|
error |= rft.run_impl<cl_ulong,
|
||||||
BC<cl_ulong, SubgroupsBroadcastOp::broadcast_first>>(
|
BC<cl_ulong, SubgroupsBroadcastOp::broadcast_first>>(
|
||||||
"test_bcast_first", bcast_first_source);
|
"sub_group_broadcast_first");
|
||||||
error |= rft.run_impl<cl_short,
|
error |= rft.run_impl<cl_short,
|
||||||
BC<cl_short, SubgroupsBroadcastOp::broadcast_first>>(
|
BC<cl_short, SubgroupsBroadcastOp::broadcast_first>>(
|
||||||
"test_bcast_first", bcast_first_source);
|
"sub_group_broadcast_first");
|
||||||
error |= rft.run_impl<cl_ushort,
|
error |= rft.run_impl<cl_ushort,
|
||||||
BC<cl_ushort, SubgroupsBroadcastOp::broadcast_first>>(
|
BC<cl_ushort, SubgroupsBroadcastOp::broadcast_first>>(
|
||||||
"test_bcast_first", bcast_first_source);
|
"sub_group_broadcast_first");
|
||||||
error |= rft.run_impl<cl_char,
|
error |= rft.run_impl<cl_char,
|
||||||
BC<cl_char, SubgroupsBroadcastOp::broadcast_first>>(
|
BC<cl_char, SubgroupsBroadcastOp::broadcast_first>>(
|
||||||
"test_bcast_first", bcast_first_source);
|
"sub_group_broadcast_first");
|
||||||
error |= rft.run_impl<cl_uchar,
|
error |= rft.run_impl<cl_uchar,
|
||||||
BC<cl_uchar, SubgroupsBroadcastOp::broadcast_first>>(
|
BC<cl_uchar, SubgroupsBroadcastOp::broadcast_first>>(
|
||||||
"test_bcast_first", bcast_first_source);
|
"sub_group_broadcast_first");
|
||||||
error |= rft.run_impl<cl_float,
|
error |= rft.run_impl<cl_float,
|
||||||
BC<cl_float, SubgroupsBroadcastOp::broadcast_first>>(
|
BC<cl_float, SubgroupsBroadcastOp::broadcast_first>>(
|
||||||
"test_bcast_first", bcast_first_source);
|
"sub_group_broadcast_first");
|
||||||
error |= rft.run_impl<cl_double,
|
error |= rft.run_impl<cl_double,
|
||||||
BC<cl_double, SubgroupsBroadcastOp::broadcast_first>>(
|
BC<cl_double, SubgroupsBroadcastOp::broadcast_first>>(
|
||||||
"test_bcast_first", bcast_first_source);
|
"sub_group_broadcast_first");
|
||||||
error |= rft.run_impl<
|
error |= rft.run_impl<
|
||||||
subgroups::cl_half,
|
subgroups::cl_half,
|
||||||
BC<subgroups::cl_half, SubgroupsBroadcastOp::broadcast_first>>(
|
BC<subgroups::cl_half, SubgroupsBroadcastOp::broadcast_first>>(
|
||||||
"test_bcast_first", bcast_first_source);
|
"sub_group_broadcast_first");
|
||||||
|
|
||||||
// mask functions
|
// mask functions
|
||||||
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::eq_mask>>(
|
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::eq_mask>>(
|
||||||
"test_get_sub_group_eq_mask", get_subgroup_eq_mask_source);
|
"get_sub_group_eq_mask");
|
||||||
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::ge_mask>>(
|
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::ge_mask>>(
|
||||||
"test_get_sub_group_ge_mask", get_subgroup_ge_mask_source);
|
"get_sub_group_ge_mask");
|
||||||
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::gt_mask>>(
|
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::gt_mask>>(
|
||||||
"test_get_sub_group_gt_mask", get_subgroup_gt_mask_source);
|
"get_sub_group_gt_mask");
|
||||||
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::le_mask>>(
|
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::le_mask>>(
|
||||||
"test_get_sub_group_le_mask", get_subgroup_le_mask_source);
|
"get_sub_group_le_mask");
|
||||||
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::lt_mask>>(
|
error |= rft.run_impl<cl_uint4, SMASK<cl_uint4, BallotOp::lt_mask>>(
|
||||||
"test_get_sub_group_lt_mask", get_subgroup_lt_mask_source);
|
"get_sub_group_lt_mask");
|
||||||
|
|
||||||
// ballot functions
|
// ballot functions
|
||||||
error |= rft.run_impl<cl_uint, BALLOT<cl_uint>>("test_sub_group_ballot",
|
WorkGroupParams test_params_ballot(global_work_size, local_work_size);
|
||||||
ballot_source);
|
test_params_ballot.save_kernel_source(
|
||||||
error |= rft.run_impl<cl_uint4,
|
sub_group_ballot_bit_scan_find_source);
|
||||||
BALLOT_INVERSE<cl_uint4, BallotOp::inverse_ballot>>(
|
test_params_ballot.save_kernel_source(sub_group_ballot_source,
|
||||||
"test_sub_group_ballot_inverse", ballot_source_inverse);
|
"sub_group_ballot");
|
||||||
error |= rft.run_impl<
|
test_params_ballot.save_kernel_source(sub_group_inverse_ballot_source,
|
||||||
|
"sub_group_inverse_ballot");
|
||||||
|
test_params_ballot.save_kernel_source(sub_group_ballot_bit_extract_source,
|
||||||
|
"sub_group_ballot_bit_extract");
|
||||||
|
RunTestForType rft_ballot(device, context, queue, num_elements,
|
||||||
|
test_params_ballot);
|
||||||
|
error |= rft_ballot.run_impl<cl_uint, BALLOT<cl_uint>>("sub_group_ballot");
|
||||||
|
error |=
|
||||||
|
rft_ballot.run_impl<cl_uint4,
|
||||||
|
BALLOT_INVERSE<cl_uint4, BallotOp::inverse_ballot>>(
|
||||||
|
"sub_group_inverse_ballot");
|
||||||
|
error |= rft_ballot.run_impl<
|
||||||
cl_uint4, BALLOT_BIT_EXTRACT<cl_uint4, BallotOp::ballot_bit_extract>>(
|
cl_uint4, BALLOT_BIT_EXTRACT<cl_uint4, BallotOp::ballot_bit_extract>>(
|
||||||
"test_sub_group_ballot_bit_extract", ballot_bit_extract_source);
|
"sub_group_ballot_bit_extract");
|
||||||
error |= rft.run_impl<
|
error |= rft_ballot.run_impl<
|
||||||
cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_bit_count>>(
|
cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_bit_count>>(
|
||||||
"test_sub_group_ballot_bit_count", ballot_bit_count_source);
|
"sub_group_ballot_bit_count");
|
||||||
error |= rft.run_impl<
|
error |= rft_ballot.run_impl<
|
||||||
cl_uint4,
|
cl_uint4,
|
||||||
BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_inclusive_scan>>(
|
BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_inclusive_scan>>(
|
||||||
"test_sub_group_ballot_inclusive_scan", ballot_inclusive_scan_source);
|
"sub_group_ballot_inclusive_scan");
|
||||||
error |= rft.run_impl<
|
error |= rft_ballot.run_impl<
|
||||||
cl_uint4,
|
cl_uint4,
|
||||||
BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_exclusive_scan>>(
|
BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_exclusive_scan>>(
|
||||||
"test_sub_group_ballot_exclusive_scan", ballot_exclusive_scan_source);
|
"sub_group_ballot_exclusive_scan");
|
||||||
error |= rft.run_impl<
|
error |= rft_ballot.run_impl<
|
||||||
cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_lsb>>(
|
cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_lsb>>(
|
||||||
"test_sub_group_ballot_find_lsb", ballot_find_lsb_source);
|
"sub_group_ballot_find_lsb");
|
||||||
error |= rft.run_impl<
|
error |= rft_ballot.run_impl<
|
||||||
cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_msb>>(
|
cl_uint4, BALLOT_COUNT_SCAN_FIND<cl_uint4, BallotOp::ballot_find_msb>>(
|
||||||
"test_sub_group_ballot_find_msb", ballot_find_msb_source);
|
"sub_group_ballot_find_msb");
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -22,149 +22,17 @@
|
|||||||
#define CLUSTER_SIZE_STR "4"
|
#define CLUSTER_SIZE_STR "4"
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
static const char *redadd_clustered_source =
|
std::string sub_group_clustered_reduce_source = R"(
|
||||||
"__kernel void test_redadd_clustered(const __global Type *in, __global "
|
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
|
||||||
"int4 *xy, __global Type *out)\n"
|
int gid = get_global_id(0);
|
||||||
"{\n"
|
XY(xy,gid);
|
||||||
" int gid = get_global_id(0);\n"
|
xy[gid].w = 0;
|
||||||
" XY(xy,gid);\n"
|
if (sizeof(in[gid]) == sizeof(%s(in[gid], )" CLUSTER_SIZE_STR R"())) {
|
||||||
" xy[gid].w = 0;\n"
|
xy[gid].w = sizeof(in[gid]);
|
||||||
" if (sizeof(in[gid]) == "
|
}
|
||||||
"sizeof(sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR ")))\n"
|
out[gid] = %s(in[gid], )" CLUSTER_SIZE_STR R"();
|
||||||
" {xy[gid].w = sizeof(in[gid]);}\n"
|
}
|
||||||
" out[gid] = sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR
|
)";
|
||||||
");\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
static const char *redmax_clustered_source =
|
|
||||||
"__kernel void test_redmax_clustered(const __global Type *in, __global "
|
|
||||||
"int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" xy[gid].w = 0;\n"
|
|
||||||
" if (sizeof(in[gid]) == "
|
|
||||||
"sizeof(sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR ")))\n"
|
|
||||||
" {xy[gid].w = sizeof(in[gid]);}\n"
|
|
||||||
" out[gid] = sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR
|
|
||||||
");\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
static const char *redmin_clustered_source =
|
|
||||||
"__kernel void test_redmin_clustered(const __global Type *in, __global "
|
|
||||||
"int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" xy[gid].w = 0;\n"
|
|
||||||
" if (sizeof(in[gid]) == "
|
|
||||||
"sizeof(sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR ")))\n"
|
|
||||||
" {xy[gid].w = sizeof(in[gid]);}\n"
|
|
||||||
" out[gid] = sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR
|
|
||||||
");\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
static const char *redmul_clustered_source =
|
|
||||||
"__kernel void test_redmul_clustered(const __global Type *in, __global "
|
|
||||||
"int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" xy[gid].w = 0;\n"
|
|
||||||
" if (sizeof(in[gid]) == "
|
|
||||||
"sizeof(sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR ")))\n"
|
|
||||||
" {xy[gid].w = sizeof(in[gid]);}\n"
|
|
||||||
" out[gid] = sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR
|
|
||||||
");\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
static const char *redand_clustered_source =
|
|
||||||
"__kernel void test_redand_clustered(const __global Type *in, __global "
|
|
||||||
"int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" xy[gid].w = 0;\n"
|
|
||||||
" if (sizeof(in[gid]) == "
|
|
||||||
"sizeof(sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR ")))\n"
|
|
||||||
" {xy[gid].w = sizeof(in[gid]);}\n"
|
|
||||||
" out[gid] = sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR
|
|
||||||
");\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
static const char *redor_clustered_source =
|
|
||||||
"__kernel void test_redor_clustered(const __global Type *in, __global int4 "
|
|
||||||
"*xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" xy[gid].w = 0;\n"
|
|
||||||
" if (sizeof(in[gid]) == "
|
|
||||||
"sizeof(sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR ")))\n"
|
|
||||||
" {xy[gid].w = sizeof(in[gid]);}\n"
|
|
||||||
" out[gid] = sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR
|
|
||||||
");\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
static const char *redxor_clustered_source =
|
|
||||||
"__kernel void test_redxor_clustered(const __global Type *in, __global "
|
|
||||||
"int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" xy[gid].w = 0;\n"
|
|
||||||
" if (sizeof(in[gid]) == "
|
|
||||||
"sizeof(sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR ")))\n"
|
|
||||||
" {xy[gid].w = sizeof(in[gid]);}\n"
|
|
||||||
" out[gid] = sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR
|
|
||||||
");\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
static const char *redand_clustered_logical_source =
|
|
||||||
"__kernel void test_redand_clustered_logical(const __global Type *in, "
|
|
||||||
"__global int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" xy[gid].w = 0;\n"
|
|
||||||
" if (sizeof(in[gid]) == "
|
|
||||||
"sizeof(sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR
|
|
||||||
")))\n"
|
|
||||||
" {xy[gid].w = sizeof(in[gid]);}\n"
|
|
||||||
" out[gid] = "
|
|
||||||
"sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR ");\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
static const char *redor_clustered_logical_source =
|
|
||||||
"__kernel void test_redor_clustered_logical(const __global Type *in, "
|
|
||||||
"__global int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" xy[gid].w = 0;\n"
|
|
||||||
" if (sizeof(in[gid]) == "
|
|
||||||
"sizeof(sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR
|
|
||||||
")))\n"
|
|
||||||
" {xy[gid].w = sizeof(in[gid]);}\n"
|
|
||||||
" out[gid] = "
|
|
||||||
"sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR ");\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
static const char *redxor_clustered_logical_source =
|
|
||||||
"__kernel void test_redxor_clustered_logical(const __global Type *in, "
|
|
||||||
"__global int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" xy[gid].w = 0;\n"
|
|
||||||
" if ( sizeof(in[gid]) == "
|
|
||||||
"sizeof(sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR
|
|
||||||
")))\n"
|
|
||||||
" {xy[gid].w = sizeof(in[gid]);}\n"
|
|
||||||
" out[gid] = "
|
|
||||||
"sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR ");\n"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
|
|
||||||
// DESCRIPTION:
|
// DESCRIPTION:
|
||||||
// Test for reduce cluster functions
|
// Test for reduce cluster functions
|
||||||
@@ -267,34 +135,34 @@ template <typename T>
|
|||||||
int run_cluster_red_add_max_min_mul_for_type(RunTestForType rft)
|
int run_cluster_red_add_max_min_mul_for_type(RunTestForType rft)
|
||||||
{
|
{
|
||||||
int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::add_>>(
|
int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::add_>>(
|
||||||
"test_redadd_clustered", redadd_clustered_source);
|
"sub_group_clustered_reduce_add");
|
||||||
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::max_>>(
|
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::max_>>(
|
||||||
"test_redmax_clustered", redmax_clustered_source);
|
"sub_group_clustered_reduce_max");
|
||||||
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::min_>>(
|
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::min_>>(
|
||||||
"test_redmin_clustered", redmin_clustered_source);
|
"sub_group_clustered_reduce_min");
|
||||||
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::mul_>>(
|
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::mul_>>(
|
||||||
"test_redmul_clustered", redmul_clustered_source);
|
"sub_group_clustered_reduce_mul");
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
template <typename T> int run_cluster_and_or_xor_for_type(RunTestForType rft)
|
template <typename T> int run_cluster_and_or_xor_for_type(RunTestForType rft)
|
||||||
{
|
{
|
||||||
int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::and_>>(
|
int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::and_>>(
|
||||||
"test_redand_clustered", redand_clustered_source);
|
"sub_group_clustered_reduce_and");
|
||||||
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::or_>>(
|
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::or_>>(
|
||||||
"test_redor_clustered", redor_clustered_source);
|
"sub_group_clustered_reduce_or");
|
||||||
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::xor_>>(
|
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::xor_>>(
|
||||||
"test_redxor_clustered", redxor_clustered_source);
|
"sub_group_clustered_reduce_xor");
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
template <typename T>
|
template <typename T>
|
||||||
int run_cluster_logical_and_or_xor_for_type(RunTestForType rft)
|
int run_cluster_logical_and_or_xor_for_type(RunTestForType rft)
|
||||||
{
|
{
|
||||||
int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_and>>(
|
int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_and>>(
|
||||||
"test_redand_clustered_logical", redand_clustered_logical_source);
|
"sub_group_clustered_reduce_logical_and");
|
||||||
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_or>>(
|
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_or>>(
|
||||||
"test_redor_clustered_logical", redor_clustered_logical_source);
|
"sub_group_clustered_reduce_logical_or");
|
||||||
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_xor>>(
|
error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_xor>>(
|
||||||
"test_redxor_clustered_logical", redxor_clustered_logical_source);
|
"sub_group_clustered_reduce_logical_xor");
|
||||||
|
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
@@ -311,9 +179,11 @@ int test_subgroup_functions_clustered_reduce(cl_device_id device,
|
|||||||
"device, skipping test.\n");
|
"device, skipping test.\n");
|
||||||
return TEST_SKIPPED_ITSELF;
|
return TEST_SKIPPED_ITSELF;
|
||||||
}
|
}
|
||||||
|
|
||||||
constexpr size_t global_work_size = 2000;
|
constexpr size_t global_work_size = 2000;
|
||||||
constexpr size_t local_work_size = 200;
|
constexpr size_t local_work_size = 200;
|
||||||
WorkGroupParams test_params(global_work_size, local_work_size);
|
WorkGroupParams test_params(global_work_size, local_work_size);
|
||||||
|
test_params.save_kernel_source(sub_group_clustered_reduce_source);
|
||||||
RunTestForType rft(device, context, queue, num_elements, test_params);
|
RunTestForType rft(device, context, queue, num_elements, test_params);
|
||||||
|
|
||||||
int error = run_cluster_red_add_max_min_mul_for_type<cl_int>(rft);
|
int error = run_cluster_red_add_max_min_mul_for_type<cl_int>(rft);
|
||||||
|
|||||||
@@ -24,30 +24,30 @@ namespace {
|
|||||||
template <typename T> int run_broadcast_for_extended_type(RunTestForType rft)
|
template <typename T> int run_broadcast_for_extended_type(RunTestForType rft)
|
||||||
{
|
{
|
||||||
int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
|
int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
|
||||||
"test_bcast", bcast_source);
|
"sub_group_broadcast");
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T> int run_scan_reduction_for_type(RunTestForType rft)
|
template <typename T> int run_scan_reduction_for_type(RunTestForType rft)
|
||||||
{
|
{
|
||||||
int error = rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("test_redadd",
|
int error =
|
||||||
redadd_source);
|
rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("sub_group_reduce_add");
|
||||||
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("test_redmax",
|
error |=
|
||||||
redmax_source);
|
rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("sub_group_reduce_max");
|
||||||
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("test_redmin",
|
error |=
|
||||||
redmin_source);
|
rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("sub_group_reduce_min");
|
||||||
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>("test_scinadd",
|
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
|
||||||
scinadd_source);
|
"sub_group_scan_inclusive_add");
|
||||||
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>("test_scinmax",
|
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
|
||||||
scinmax_source);
|
"sub_group_scan_inclusive_max");
|
||||||
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>("test_scinmin",
|
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
|
||||||
scinmin_source);
|
"sub_group_scan_inclusive_min");
|
||||||
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>("test_scexadd",
|
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
|
||||||
scexadd_source);
|
"sub_group_scan_exclusive_add");
|
||||||
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>("test_scexmax",
|
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
|
||||||
scexmax_source);
|
"sub_group_scan_exclusive_max");
|
||||||
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>("test_scexmin",
|
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
|
||||||
scexmin_source);
|
"sub_group_scan_exclusive_min");
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -65,11 +65,15 @@ int test_subgroup_functions_extended_types(cl_device_id device,
|
|||||||
"device, skipping test.\n");
|
"device, skipping test.\n");
|
||||||
return TEST_SKIPPED_ITSELF;
|
return TEST_SKIPPED_ITSELF;
|
||||||
}
|
}
|
||||||
|
|
||||||
constexpr size_t global_work_size = 2000;
|
constexpr size_t global_work_size = 2000;
|
||||||
constexpr size_t local_work_size = 200;
|
constexpr size_t local_work_size = 200;
|
||||||
WorkGroupParams test_params(global_work_size, local_work_size);
|
WorkGroupParams test_params(global_work_size, local_work_size);
|
||||||
RunTestForType rft(device, context, queue, num_elements, test_params);
|
test_params.save_kernel_source(sub_group_reduction_scan_source);
|
||||||
|
test_params.save_kernel_source(sub_group_generic_source,
|
||||||
|
"sub_group_broadcast");
|
||||||
|
|
||||||
|
RunTestForType rft(device, context, queue, num_elements, test_params);
|
||||||
int error = run_broadcast_for_extended_type<cl_uint2>(rft);
|
int error = run_broadcast_for_extended_type<cl_uint2>(rft);
|
||||||
error |= run_broadcast_for_extended_type<subgroups::cl_uint3>(rft);
|
error |= run_broadcast_for_extended_type<subgroups::cl_uint3>(rft);
|
||||||
error |= run_broadcast_for_extended_type<cl_uint4>(rft);
|
error |= run_broadcast_for_extended_type<cl_uint4>(rft);
|
||||||
|
|||||||
@@ -17,336 +17,29 @@
|
|||||||
#include "subhelpers.h"
|
#include "subhelpers.h"
|
||||||
#include "harness/typeWrappers.h"
|
#include "harness/typeWrappers.h"
|
||||||
#include "subgroup_common_templates.h"
|
#include "subgroup_common_templates.h"
|
||||||
|
#include <cstdio>
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
static const char *scinadd_non_uniform_source = R"(
|
std::string sub_group_non_uniform_arithmetic_source = R"(
|
||||||
__kernel void test_scinadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
|
||||||
int gid = get_global_id(0);
|
int gid = get_global_id(0);
|
||||||
XY(xy,gid);
|
XY(xy,gid);
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
uint subgroup_local_id = get_sub_group_local_id();
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
uint elect_work_item = 1 << (subgroup_local_id % 32);
|
||||||
out[gid] = sub_group_non_uniform_scan_inclusive_add(in[gid]);
|
uint work_item_mask;
|
||||||
}
|
if(subgroup_local_id < 32) {
|
||||||
}
|
work_item_mask = work_item_mask_vector.x;
|
||||||
)";
|
} else if(subgroup_local_id < 64) {
|
||||||
|
work_item_mask = work_item_mask_vector.y;
|
||||||
static const char *scinmax_non_uniform_source = R"(
|
} else if(subgroup_local_id < 96) {
|
||||||
__kernel void test_scinmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
work_item_mask = work_item_mask_vector.w;
|
||||||
int gid = get_global_id(0);
|
} else if(subgroup_local_id < 128) {
|
||||||
XY(xy,gid);
|
work_item_mask = work_item_mask_vector.z;
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
}
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
if (elect_work_item & work_item_mask){
|
||||||
out[gid] = sub_group_non_uniform_scan_inclusive_max(in[gid]);
|
out[gid] = %s(in[gid]);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *scinmin_non_uniform_source = R"(
|
|
||||||
__kernel void test_scinmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_scan_inclusive_min(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *scinmul_non_uniform_source = R"(
|
|
||||||
__kernel void test_scinmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_scan_inclusive_mul(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *scinand_non_uniform_source = R"(
|
|
||||||
__kernel void test_scinand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_scan_inclusive_and(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *scinor_non_uniform_source = R"(
|
|
||||||
__kernel void test_scinor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_scan_inclusive_or(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *scinxor_non_uniform_source = R"(
|
|
||||||
__kernel void test_scinxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_scan_inclusive_xor(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *scinand_non_uniform_logical_source = R"(
|
|
||||||
__kernel void test_scinand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_scan_inclusive_logical_and(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *scinor_non_uniform_logical_source = R"(
|
|
||||||
__kernel void test_scinor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_scan_inclusive_logical_or(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *scinxor_non_uniform_logical_source = R"(
|
|
||||||
__kernel void test_scinxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_scan_inclusive_logical_xor(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *scexadd_non_uniform_source = R"(
|
|
||||||
__kernel void test_scexadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_scan_exclusive_add(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *scexmax_non_uniform_source = R"(
|
|
||||||
__kernel void test_scexmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_scan_exclusive_max(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *scexmin_non_uniform_source = R"(
|
|
||||||
__kernel void test_scexmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_scan_exclusive_min(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *scexmul_non_uniform_source = R"(
|
|
||||||
__kernel void test_scexmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_scan_exclusive_mul(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *scexand_non_uniform_source = R"(
|
|
||||||
__kernel void test_scexand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_scan_exclusive_and(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *scexor_non_uniform_source = R"(
|
|
||||||
__kernel void test_scexor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_scan_exclusive_or(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *scexxor_non_uniform_source = R"(
|
|
||||||
__kernel void test_scexxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_scan_exclusive_xor(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *scexand_non_uniform_logical_source = R"(
|
|
||||||
__kernel void test_scexand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_scan_exclusive_logical_and(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *scexor_non_uniform_logical_source = R"(
|
|
||||||
__kernel void test_scexor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_scan_exclusive_logical_or(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *scexxor_non_uniform_logical_source = R"(
|
|
||||||
__kernel void test_scexxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_scan_exclusive_logical_xor(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *redadd_non_uniform_source = R"(
|
|
||||||
__kernel void test_redadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_reduce_add(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *redmax_non_uniform_source = R"(
|
|
||||||
__kernel void test_redmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_reduce_max(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *redmin_non_uniform_source = R"(
|
|
||||||
__kernel void test_redmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_reduce_min(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *redmul_non_uniform_source = R"(
|
|
||||||
__kernel void test_redmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_reduce_mul(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *redand_non_uniform_source = R"(
|
|
||||||
__kernel void test_redand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_reduce_and(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *redor_non_uniform_source = R"(
|
|
||||||
__kernel void test_redor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_reduce_or(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *redxor_non_uniform_source = R"(
|
|
||||||
__kernel void test_redxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_reduce_xor(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *redand_non_uniform_logical_source = R"(
|
|
||||||
__kernel void test_redand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_reduce_logical_and(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *redor_non_uniform_logical_source = R"(
|
|
||||||
__kernel void test_redor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_reduce_logical_or(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *redxor_non_uniform_logical_source = R"(
|
|
||||||
__kernel void test_redxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_reduce_logical_xor(in[gid]);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
)";
|
)";
|
||||||
|
|
||||||
@@ -354,52 +47,52 @@ template <typename T>
|
|||||||
int run_functions_add_mul_max_min_for_type(RunTestForType rft)
|
int run_functions_add_mul_max_min_for_type(RunTestForType rft)
|
||||||
{
|
{
|
||||||
int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
|
int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
|
||||||
"test_scinadd_non_uniform", scinadd_non_uniform_source);
|
"sub_group_non_uniform_scan_inclusive_add");
|
||||||
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::mul_>>(
|
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::mul_>>(
|
||||||
"test_scinmul_non_uniform", scinmul_non_uniform_source);
|
"sub_group_non_uniform_scan_inclusive_mul");
|
||||||
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
|
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
|
||||||
"test_scinmax_non_uniform", scinmax_non_uniform_source);
|
"sub_group_non_uniform_scan_inclusive_max");
|
||||||
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
|
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
|
||||||
"test_scinmin_non_uniform", scinmin_non_uniform_source);
|
"sub_group_non_uniform_scan_inclusive_min");
|
||||||
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
|
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
|
||||||
"test_scexadd_non_uniform", scexadd_non_uniform_source);
|
"sub_group_non_uniform_scan_exclusive_add");
|
||||||
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::mul_>>(
|
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::mul_>>(
|
||||||
"test_scexmul_non_uniform", scexmul_non_uniform_source);
|
"sub_group_non_uniform_scan_exclusive_mul");
|
||||||
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
|
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
|
||||||
"test_scexmax_non_uniform", scexmax_non_uniform_source);
|
"sub_group_non_uniform_scan_exclusive_max");
|
||||||
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
|
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
|
||||||
"test_scexmin_non_uniform", scexmin_non_uniform_source);
|
"sub_group_non_uniform_scan_exclusive_min");
|
||||||
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>(
|
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>(
|
||||||
"test_redadd_non_uniform", redadd_non_uniform_source);
|
"sub_group_non_uniform_reduce_add");
|
||||||
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::mul_>>(
|
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::mul_>>(
|
||||||
"test_redmul_non_uniform", redmul_non_uniform_source);
|
"sub_group_non_uniform_reduce_mul");
|
||||||
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>(
|
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>(
|
||||||
"test_redmax_non_uniform", redmax_non_uniform_source);
|
"sub_group_non_uniform_reduce_max");
|
||||||
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>(
|
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>(
|
||||||
"test_redmin_non_uniform", redmin_non_uniform_source);
|
"sub_group_non_uniform_reduce_min");
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T> int run_functions_and_or_xor_for_type(RunTestForType rft)
|
template <typename T> int run_functions_and_or_xor_for_type(RunTestForType rft)
|
||||||
{
|
{
|
||||||
int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::and_>>(
|
int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::and_>>(
|
||||||
"test_scinand_non_uniform", scinand_non_uniform_source);
|
"sub_group_non_uniform_scan_inclusive_and");
|
||||||
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::or_>>(
|
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::or_>>(
|
||||||
"test_scinor_non_uniform", scinor_non_uniform_source);
|
"sub_group_non_uniform_scan_inclusive_or");
|
||||||
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::xor_>>(
|
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::xor_>>(
|
||||||
"test_scinxor_non_uniform", scinxor_non_uniform_source);
|
"sub_group_non_uniform_scan_inclusive_xor");
|
||||||
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::and_>>(
|
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::and_>>(
|
||||||
"test_scexand_non_uniform", scexand_non_uniform_source);
|
"sub_group_non_uniform_scan_exclusive_and");
|
||||||
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::or_>>(
|
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::or_>>(
|
||||||
"test_scexor_non_uniform", scexor_non_uniform_source);
|
"sub_group_non_uniform_scan_exclusive_or");
|
||||||
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::xor_>>(
|
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::xor_>>(
|
||||||
"test_scexxor_non_uniform", scexxor_non_uniform_source);
|
"sub_group_non_uniform_scan_exclusive_xor");
|
||||||
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::and_>>(
|
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::and_>>(
|
||||||
"test_redand_non_uniform", redand_non_uniform_source);
|
"sub_group_non_uniform_reduce_and");
|
||||||
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::or_>>(
|
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::or_>>(
|
||||||
"test_redor_non_uniform", redor_non_uniform_source);
|
"sub_group_non_uniform_reduce_or");
|
||||||
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::xor_>>(
|
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::xor_>>(
|
||||||
"test_redxor_non_uniform", redxor_non_uniform_source);
|
"sub_group_non_uniform_reduce_xor");
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -407,23 +100,23 @@ template <typename T>
|
|||||||
int run_functions_logical_and_or_xor_for_type(RunTestForType rft)
|
int run_functions_logical_and_or_xor_for_type(RunTestForType rft)
|
||||||
{
|
{
|
||||||
int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_and>>(
|
int error = rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_and>>(
|
||||||
"test_scinand_non_uniform_logical", scinand_non_uniform_logical_source);
|
"sub_group_non_uniform_scan_inclusive_logical_and");
|
||||||
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_or>>(
|
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_or>>(
|
||||||
"test_scinor_non_uniform_logical", scinor_non_uniform_logical_source);
|
"sub_group_non_uniform_scan_inclusive_logical_or");
|
||||||
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_xor>>(
|
error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_xor>>(
|
||||||
"test_scinxor_non_uniform_logical", scinxor_non_uniform_logical_source);
|
"sub_group_non_uniform_scan_inclusive_logical_xor");
|
||||||
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_and>>(
|
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_and>>(
|
||||||
"test_scexand_non_uniform_logical", scexand_non_uniform_logical_source);
|
"sub_group_non_uniform_scan_exclusive_logical_and");
|
||||||
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_or>>(
|
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_or>>(
|
||||||
"test_scexor_non_uniform_logical", scexor_non_uniform_logical_source);
|
"sub_group_non_uniform_scan_exclusive_logical_or");
|
||||||
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_xor>>(
|
error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_xor>>(
|
||||||
"test_scexxor_non_uniform_logical", scexxor_non_uniform_logical_source);
|
"sub_group_non_uniform_scan_exclusive_logical_xor");
|
||||||
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_and>>(
|
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_and>>(
|
||||||
"test_redand_non_uniform_logical", redand_non_uniform_logical_source);
|
"sub_group_non_uniform_reduce_logical_and");
|
||||||
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_or>>(
|
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_or>>(
|
||||||
"test_redor_non_uniform_logical", redor_non_uniform_logical_source);
|
"sub_group_non_uniform_reduce_logical_or");
|
||||||
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_xor>>(
|
error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_xor>>(
|
||||||
"test_redxor_non_uniform_logical", redxor_non_uniform_logical_source);
|
"sub_group_non_uniform_reduce_logical_xor");
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -441,13 +134,11 @@ int test_subgroup_functions_non_uniform_arithmetic(cl_device_id device,
|
|||||||
"this device, skipping test.\n");
|
"this device, skipping test.\n");
|
||||||
return TEST_SKIPPED_ITSELF;
|
return TEST_SKIPPED_ITSELF;
|
||||||
}
|
}
|
||||||
std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
|
|
||||||
0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
|
|
||||||
0x00ffff00, 0x80000000, 0xaaaaaaaa };
|
|
||||||
|
|
||||||
constexpr size_t global_work_size = 2000;
|
constexpr size_t global_work_size = 2000;
|
||||||
constexpr size_t local_work_size = 200;
|
constexpr size_t local_work_size = 200;
|
||||||
WorkGroupParams test_params(global_work_size, local_work_size, masks);
|
WorkGroupParams test_params(global_work_size, local_work_size, true);
|
||||||
|
test_params.save_kernel_source(sub_group_non_uniform_arithmetic_source);
|
||||||
RunTestForType rft(device, context, queue, num_elements, test_params);
|
RunTestForType rft(device, context, queue, num_elements, test_params);
|
||||||
|
|
||||||
int error = run_functions_add_mul_max_min_for_type<cl_int>(rft);
|
int error = run_functions_add_mul_max_min_for_type<cl_int>(rft);
|
||||||
|
|||||||
@@ -28,7 +28,6 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
|
|||||||
int nw = test_params.local_workgroup_size;
|
int nw = test_params.local_workgroup_size;
|
||||||
int ns = test_params.subgroup_size;
|
int ns = test_params.subgroup_size;
|
||||||
int ng = test_params.global_workgroup_size;
|
int ng = test_params.global_workgroup_size;
|
||||||
uint32_t work_items_mask = test_params.work_items_mask;
|
|
||||||
int nj = (nw + ns - 1) / ns;
|
int nj = (nw + ns - 1) / ns;
|
||||||
int non_uniform_size = ng % nw;
|
int non_uniform_size = ng % nw;
|
||||||
ng = ng / nw;
|
ng = ng / nw;
|
||||||
@@ -40,9 +39,11 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
|
|||||||
operation_names(operation));
|
operation_names(operation));
|
||||||
|
|
||||||
log_info(" test params: global size = %d local size = %d subgroups "
|
log_info(" test params: global size = %d local size = %d subgroups "
|
||||||
"size = %d work item mask = 0x%x data type (%s)\n",
|
"size = %d data type (%s)\n",
|
||||||
test_params.global_workgroup_size, nw, ns, work_items_mask,
|
test_params.global_workgroup_size, nw, ns,
|
||||||
TypeManager<T>::name());
|
TypeManager<T>::name());
|
||||||
|
log_info(" work items mask: %s\n",
|
||||||
|
test_params.work_items_mask.to_string().c_str());
|
||||||
if (non_uniform_size)
|
if (non_uniform_size)
|
||||||
{
|
{
|
||||||
log_info(" non uniform work group size mode ON\n");
|
log_info(" non uniform work group size mode ON\n");
|
||||||
@@ -99,7 +100,6 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
|
|||||||
int nw = test_params.local_workgroup_size;
|
int nw = test_params.local_workgroup_size;
|
||||||
int ns = test_params.subgroup_size;
|
int ns = test_params.subgroup_size;
|
||||||
int ng = test_params.global_workgroup_size;
|
int ng = test_params.global_workgroup_size;
|
||||||
uint32_t work_items_mask = test_params.work_items_mask;
|
|
||||||
int nj = (nw + ns - 1) / ns;
|
int nj = (nw + ns - 1) / ns;
|
||||||
cl_int tr, rr;
|
cl_int tr, rr;
|
||||||
int non_uniform_size = ng % nw;
|
int non_uniform_size = ng % nw;
|
||||||
@@ -141,8 +141,7 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
|
|||||||
std::set<int> active_work_items;
|
std::set<int> active_work_items;
|
||||||
for (i = 0; i < n; ++i)
|
for (i = 0; i < n; ++i)
|
||||||
{
|
{
|
||||||
uint32_t check_work_item = 1 << (i % 32);
|
if (test_params.work_items_mask.test(i))
|
||||||
if (work_items_mask & check_work_item)
|
|
||||||
{
|
{
|
||||||
active_work_items.insert(i);
|
active_work_items.insert(i);
|
||||||
switch (operation)
|
switch (operation)
|
||||||
@@ -215,46 +214,47 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
|
|||||||
return TEST_PASS;
|
return TEST_PASS;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
static const char *elect_source = R"(
|
|
||||||
__kernel void test_elect(const __global Type *in, __global int4 *xy, __global Type *out) {
|
std::string sub_group_elect_source = R"(
|
||||||
|
__kernel void test_sub_group_elect(const __global Type *in, __global int4 *xy, __global Type *out) {
|
||||||
int gid = get_global_id(0);
|
int gid = get_global_id(0);
|
||||||
XY(xy,gid);
|
XY(xy,gid);
|
||||||
uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
uint subgroup_local_id = get_sub_group_local_id();
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
uint elect_work_item = 1 << (subgroup_local_id % 32);
|
||||||
out[gid] = sub_group_elect();
|
uint work_item_mask;
|
||||||
}
|
if(subgroup_local_id < 32) {
|
||||||
|
work_item_mask = work_item_mask_vector.x;
|
||||||
|
} else if(subgroup_local_id < 64) {
|
||||||
|
work_item_mask = work_item_mask_vector.y;
|
||||||
|
} else if(subgroup_local_id < 96) {
|
||||||
|
work_item_mask = work_item_mask_vector.w;
|
||||||
|
} else if(subgroup_local_id < 128) {
|
||||||
|
work_item_mask = work_item_mask_vector.z;
|
||||||
|
}
|
||||||
|
if (elect_work_item & work_item_mask){
|
||||||
|
out[gid] = sub_group_elect();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
)";
|
)";
|
||||||
|
|
||||||
static const char *non_uniform_any_source = R"(
|
std::string sub_group_non_uniform_any_all_all_equal_source = R"(
|
||||||
__kernel void test_non_uniform_any(const __global Type *in, __global int4 *xy, __global Type *out) {
|
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) {
|
||||||
int gid = get_global_id(0);
|
int gid = get_global_id(0);
|
||||||
XY(xy,gid);
|
XY(xy,gid);
|
||||||
uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
uint subgroup_local_id = get_sub_group_local_id();
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
uint elect_work_item = 1 << (subgroup_local_id % 32);
|
||||||
out[gid] = sub_group_non_uniform_any(in[gid]);
|
uint work_item_mask;
|
||||||
}
|
if(subgroup_local_id < 32) {
|
||||||
}
|
work_item_mask = work_item_mask_vector.x;
|
||||||
)";
|
} else if(subgroup_local_id < 64) {
|
||||||
|
work_item_mask = work_item_mask_vector.y;
|
||||||
static const char *non_uniform_all_source = R"(
|
} else if(subgroup_local_id < 96) {
|
||||||
__kernel void test_non_uniform_all(const __global Type *in, __global int4 *xy, __global Type *out) {
|
work_item_mask = work_item_mask_vector.w;
|
||||||
int gid = get_global_id(0);
|
} else if(subgroup_local_id < 128) {
|
||||||
XY(xy,gid);
|
work_item_mask = work_item_mask_vector.z;
|
||||||
uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
}
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
if (elect_work_item & work_item_mask){
|
||||||
out[gid] = sub_group_non_uniform_all(in[gid]);
|
out[gid] = %s(in[gid]);
|
||||||
}
|
|
||||||
}
|
|
||||||
)";
|
|
||||||
|
|
||||||
static const char *non_uniform_all_equal_source = R"(
|
|
||||||
__kernel void test_non_uniform_all_equal(const __global Type *in, __global int4 *xy, __global Type *out) {
|
|
||||||
int gid = get_global_id(0);
|
|
||||||
XY(xy,gid);
|
|
||||||
uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
|
|
||||||
if (elect_work_item & WORK_ITEMS_MASK){
|
|
||||||
out[gid] = sub_group_non_uniform_all_equal(in[gid]);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
)";
|
)";
|
||||||
@@ -262,7 +262,7 @@ static const char *non_uniform_all_equal_source = R"(
|
|||||||
template <typename T> int run_vote_all_equal_for_type(RunTestForType rft)
|
template <typename T> int run_vote_all_equal_for_type(RunTestForType rft)
|
||||||
{
|
{
|
||||||
int error = rft.run_impl<T, VOTE<T, NonUniformVoteOp::all_equal>>(
|
int error = rft.run_impl<T, VOTE<T, NonUniformVoteOp::all_equal>>(
|
||||||
"test_non_uniform_all_equal", non_uniform_all_equal_source);
|
"sub_group_non_uniform_all_equal");
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -278,12 +278,13 @@ int test_subgroup_functions_non_uniform_vote(cl_device_id device,
|
|||||||
"device, skipping test.\n");
|
"device, skipping test.\n");
|
||||||
return TEST_SKIPPED_ITSELF;
|
return TEST_SKIPPED_ITSELF;
|
||||||
}
|
}
|
||||||
std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
|
|
||||||
0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
|
|
||||||
0x00ffff00, 0x80000000 };
|
|
||||||
constexpr size_t global_work_size = 170;
|
constexpr size_t global_work_size = 170;
|
||||||
constexpr size_t local_work_size = 64;
|
constexpr size_t local_work_size = 64;
|
||||||
WorkGroupParams test_params(global_work_size, local_work_size, masks);
|
WorkGroupParams test_params(global_work_size, local_work_size, true);
|
||||||
|
test_params.save_kernel_source(
|
||||||
|
sub_group_non_uniform_any_all_all_equal_source);
|
||||||
|
test_params.save_kernel_source(sub_group_elect_source, "sub_group_elect");
|
||||||
RunTestForType rft(device, context, queue, num_elements, test_params);
|
RunTestForType rft(device, context, queue, num_elements, test_params);
|
||||||
|
|
||||||
int error = run_vote_all_equal_for_type<cl_int>(rft);
|
int error = run_vote_all_equal_for_type<cl_int>(rft);
|
||||||
@@ -295,10 +296,10 @@ int test_subgroup_functions_non_uniform_vote(cl_device_id device,
|
|||||||
error |= run_vote_all_equal_for_type<subgroups::cl_half>(rft);
|
error |= run_vote_all_equal_for_type<subgroups::cl_half>(rft);
|
||||||
|
|
||||||
error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::all>>(
|
error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::all>>(
|
||||||
"test_non_uniform_all", non_uniform_all_source);
|
"sub_group_non_uniform_all");
|
||||||
error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::elect>>(
|
error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::elect>>(
|
||||||
"test_elect", elect_source);
|
"sub_group_elect");
|
||||||
error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::any>>(
|
error |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::any>>(
|
||||||
"test_non_uniform_any", non_uniform_any_source);
|
"sub_group_non_uniform_any");
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -15,38 +15,19 @@
|
|||||||
//
|
//
|
||||||
#include "procs.h"
|
#include "procs.h"
|
||||||
#include "subhelpers.h"
|
#include "subhelpers.h"
|
||||||
|
#include "subgroup_common_kernels.h"
|
||||||
#include "subgroup_common_templates.h"
|
#include "subgroup_common_templates.h"
|
||||||
#include "harness/typeWrappers.h"
|
#include "harness/typeWrappers.h"
|
||||||
#include <bitset>
|
#include <bitset>
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
static const char* shuffle_xor_source =
|
|
||||||
"__kernel void test_sub_group_shuffle_xor(const __global Type *in, "
|
|
||||||
"__global int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" Type x = in[gid];\n"
|
|
||||||
" out[gid] = sub_group_shuffle_xor(x, xy[gid].z);"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
static const char* shuffle_source =
|
|
||||||
"__kernel void test_sub_group_shuffle(const __global Type *in, __global "
|
|
||||||
"int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" Type x = in[gid];\n"
|
|
||||||
" out[gid] = sub_group_shuffle(x, xy[gid].z);"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
template <typename T> int run_shuffle_for_type(RunTestForType rft)
|
template <typename T> int run_shuffle_for_type(RunTestForType rft)
|
||||||
{
|
{
|
||||||
int error = rft.run_impl<T, SHF<T, ShuffleOp::shuffle>>(
|
int error =
|
||||||
"test_sub_group_shuffle", shuffle_source);
|
rft.run_impl<T, SHF<T, ShuffleOp::shuffle>>("sub_group_shuffle");
|
||||||
error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_xor>>(
|
error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_xor>>(
|
||||||
"test_sub_group_shuffle_xor", shuffle_xor_source);
|
"sub_group_shuffle_xor");
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -61,9 +42,11 @@ int test_subgroup_functions_shuffle(cl_device_id device, cl_context context,
|
|||||||
"skipping test.\n");
|
"skipping test.\n");
|
||||||
return TEST_SKIPPED_ITSELF;
|
return TEST_SKIPPED_ITSELF;
|
||||||
}
|
}
|
||||||
|
|
||||||
constexpr size_t global_work_size = 2000;
|
constexpr size_t global_work_size = 2000;
|
||||||
constexpr size_t local_work_size = 200;
|
constexpr size_t local_work_size = 200;
|
||||||
WorkGroupParams test_params(global_work_size, local_work_size);
|
WorkGroupParams test_params(global_work_size, local_work_size);
|
||||||
|
test_params.save_kernel_source(sub_group_generic_source);
|
||||||
RunTestForType rft(device, context, queue, num_elements, test_params);
|
RunTestForType rft(device, context, queue, num_elements, test_params);
|
||||||
|
|
||||||
int error = run_shuffle_for_type<cl_int>(rft);
|
int error = run_shuffle_for_type<cl_int>(rft);
|
||||||
|
|||||||
@@ -15,37 +15,19 @@
|
|||||||
//
|
//
|
||||||
#include "procs.h"
|
#include "procs.h"
|
||||||
#include "subhelpers.h"
|
#include "subhelpers.h"
|
||||||
|
#include "subgroup_common_kernels.h"
|
||||||
#include "subgroup_common_templates.h"
|
#include "subgroup_common_templates.h"
|
||||||
#include "harness/conversions.h"
|
#include "harness/conversions.h"
|
||||||
#include "harness/typeWrappers.h"
|
#include "harness/typeWrappers.h"
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
static const char* shuffle_down_source =
|
|
||||||
"__kernel void test_sub_group_shuffle_down(const __global Type *in, "
|
|
||||||
"__global int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" Type x = in[gid];\n"
|
|
||||||
" out[gid] = sub_group_shuffle_down(x, xy[gid].z);"
|
|
||||||
"}\n";
|
|
||||||
static const char* shuffle_up_source =
|
|
||||||
"__kernel void test_sub_group_shuffle_up(const __global Type *in, __global "
|
|
||||||
"int4 *xy, __global Type *out)\n"
|
|
||||||
"{\n"
|
|
||||||
" int gid = get_global_id(0);\n"
|
|
||||||
" XY(xy,gid);\n"
|
|
||||||
" Type x = in[gid];\n"
|
|
||||||
" out[gid] = sub_group_shuffle_up(x, xy[gid].z);"
|
|
||||||
"}\n";
|
|
||||||
|
|
||||||
template <typename T> int run_shuffle_relative_for_type(RunTestForType rft)
|
template <typename T> int run_shuffle_relative_for_type(RunTestForType rft)
|
||||||
{
|
{
|
||||||
int error = rft.run_impl<T, SHF<T, ShuffleOp::shuffle_up>>(
|
int error =
|
||||||
"test_sub_group_shuffle_up", shuffle_up_source);
|
rft.run_impl<T, SHF<T, ShuffleOp::shuffle_up>>("sub_group_shuffle_up");
|
||||||
error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_down>>(
|
error |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_down>>(
|
||||||
"test_sub_group_shuffle_down", shuffle_down_source);
|
"sub_group_shuffle_down");
|
||||||
return error;
|
return error;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -62,9 +44,11 @@ int test_subgroup_functions_shuffle_relative(cl_device_id device,
|
|||||||
"device, skipping test.\n");
|
"device, skipping test.\n");
|
||||||
return TEST_SKIPPED_ITSELF;
|
return TEST_SKIPPED_ITSELF;
|
||||||
}
|
}
|
||||||
|
|
||||||
constexpr size_t global_work_size = 2000;
|
constexpr size_t global_work_size = 2000;
|
||||||
constexpr size_t local_work_size = 200;
|
constexpr size_t local_work_size = 200;
|
||||||
WorkGroupParams test_params(global_work_size, local_work_size);
|
WorkGroupParams test_params(global_work_size, local_work_size);
|
||||||
|
test_params.save_kernel_source(sub_group_generic_source);
|
||||||
RunTestForType rft(device, context, queue, num_elements, test_params);
|
RunTestForType rft(device, context, queue, num_elements, test_params);
|
||||||
|
|
||||||
int error = run_shuffle_relative_for_type<cl_int>(rft);
|
int error = run_shuffle_relative_for_type<cl_int>(rft);
|
||||||
|
|||||||
Reference in New Issue
Block a user