New subgroups - full changes set (#1074)

* Extended subgroups - extended types

* Extended subgroups - non uniform vote tests

* Extended subgroups - non uniform arithmetic tests

* Extended subgroups - ballot tests

* Extended subgroups - clustered reduce tests

* Extended subgroups - shuffle tests

* Extended subgroups - formatting issues

* Extended subgroups - review fixes

* Extended subgroups - review fixes

Fixed: removed additional breaks, kernel_sstr

* Extended subgroups - fix macos build error

* Extended subgroups - review fixes

Fixed: mac os build error

* Extended subgroups - data type verification example

* Extended subgroups - error unification

* Extended subgroups - fix header years

* Extended subgroups - use is_half_nan

* Extended subgroups - compare half as float

* Review fixes mostly for ballot functions.

- Modify kernels for better handling active/inactive workitems
- Modify gen/chk functions for handling non uniform workgroup sizes
- Introduce new variables naming convention
- minor fixes

* Extended subgroups - simplification data generation for ballot lsb/msb functions

* Extended subgroups - minor fixes

* Extended subgroups - move common code to function

* Extended subgroups - formatting errors fix

* Extended subgroups - fix build error

* Extended subgroups - sub_group_elect more sophisticated

Define a mask which is a 4-byte pattern where a bit value of 1 means the work item is active.
If a work item in the subgroup matches the pattern, then it runs sub_group_elect()

* Extended subgroups - fix Ubuntu build error

* Extended subgroups - voting function review fixes

* adjust all function for using masks
* remove calculate templates
* merge code to one common template
* check results only in active workitems
* normalize values on host side
* minor fixes

* Extended subgroups - fix typos

* Set of fixes and improvements after review

* define WorkGroupParams to stop extended parameters list in function
* better work items mask handling (WorkGroupParams)
* narrow values of data input generation to avoid overflows (arithmetic func)
* implement work item masks for arithmetic functions
* enable half type testing for reduction/scan/broadcast
* minor fixes

* Extended subgroups - fix Linux issues

* Extended subgroups - fix sub_group_local_id data type

* Extended subgroups - use vector instead of array.

* Extended subgroups - change names to subgroup

* Extended subgroups - uncomment code, fix build

* Extended subgroups - build fix, use cl_half_from_float func

* Extended subgroups - remove is_half_nan

* Extended subgroups - do no use undef min/max

* Extended subgroups - use parenthesis, fix formatting
This commit is contained in:
Grzegorz Wawiorko
2021-04-06 18:25:48 +02:00
committed by GitHub
parent c5e4ca6c91
commit 71bef8563e
18 changed files with 5075 additions and 957 deletions

View File

@@ -5,8 +5,16 @@ set(${MODULE_NAME}_SOURCES
test_barrier.cpp test_barrier.cpp
test_queries.cpp test_queries.cpp
test_workitem.cpp test_workitem.cpp
test_workgroup.cpp test_subgroup.cpp
test_ifp.cpp test_ifp.cpp
test_subgroup_extended_types.cpp
subgroup_common_kernels.cpp
test_subgroup_non_uniform_vote.cpp
test_subgroup_non_uniform_arithmetic.cpp
test_subgroup_ballot.cpp
test_subgroup_clustered_reduce.cpp
test_subgroup_shuffle.cpp
test_subgroup_shuffle_relative.cpp
) )
include(../CMakeCommon.txt) include(../CMakeCommon.txt)

View File

@@ -27,12 +27,19 @@ test_definition test_list[] = {
ADD_TEST_VERSION(sub_group_info_core, Version(2, 1)), ADD_TEST_VERSION(sub_group_info_core, Version(2, 1)),
ADD_TEST_VERSION(work_item_functions_ext, Version(2, 0)), ADD_TEST_VERSION(work_item_functions_ext, Version(2, 0)),
ADD_TEST_VERSION(work_item_functions_core, Version(2, 1)), ADD_TEST_VERSION(work_item_functions_core, Version(2, 1)),
ADD_TEST_VERSION(work_group_functions_ext, Version(2, 0)), ADD_TEST_VERSION(subgroup_functions_ext, Version(2, 0)),
ADD_TEST_VERSION(work_group_functions_core, Version(2, 1)), ADD_TEST_VERSION(subgroup_functions_core, Version(2, 1)),
ADD_TEST_VERSION(barrier_functions_ext, Version(2, 0)), ADD_TEST_VERSION(barrier_functions_ext, Version(2, 0)),
ADD_TEST_VERSION(barrier_functions_core, Version(2, 1)), ADD_TEST_VERSION(barrier_functions_core, Version(2, 1)),
ADD_TEST_VERSION(ifp_ext, Version(2, 0)), ADD_TEST_VERSION(ifp_ext, Version(2, 0)),
ADD_TEST_VERSION(ifp_core, Version(2, 1)) ADD_TEST_VERSION(ifp_core, Version(2, 1)),
ADD_TEST(subgroup_functions_extended_types),
ADD_TEST(subgroup_functions_non_uniform_vote),
ADD_TEST(subgroup_functions_non_uniform_arithmetic),
ADD_TEST(subgroup_functions_ballot),
ADD_TEST(subgroup_functions_clustered_reduce),
ADD_TEST(subgroup_functions_shuffle),
ADD_TEST(subgroup_functions_shuffle_relative)
}; };
const int test_num = ARRAY_SIZE(test_list); const int test_num = ARRAY_SIZE(test_list);

View File

@@ -37,14 +37,12 @@ extern int test_work_item_functions_core(cl_device_id device,
cl_context context, cl_context context,
cl_command_queue queue, cl_command_queue queue,
int num_elements); int num_elements);
extern int test_work_group_functions_ext(cl_device_id device, extern int test_subgroup_functions_ext(cl_device_id device, cl_context context,
cl_context context, cl_command_queue queue,
cl_command_queue queue, int num_elements);
int num_elements); extern int test_subgroup_functions_core(cl_device_id device, cl_context context,
extern int test_work_group_functions_core(cl_device_id device, cl_command_queue queue,
cl_context context, int num_elements);
cl_command_queue queue,
int num_elements);
extern int test_barrier_functions_ext(cl_device_id device, cl_context context, extern int test_barrier_functions_ext(cl_device_id device, cl_context context,
cl_command_queue queue, int num_elements); cl_command_queue queue, int num_elements);
extern int test_barrier_functions_core(cl_device_id device, cl_context context, extern int test_barrier_functions_core(cl_device_id device, cl_context context,
@@ -56,5 +54,31 @@ extern int test_ifp_ext(cl_device_id device, cl_context context,
cl_command_queue queue, int num_elements); cl_command_queue queue, int num_elements);
extern int test_ifp_core(cl_device_id device, cl_context context, extern int test_ifp_core(cl_device_id device, cl_context context,
cl_command_queue queue, int num_elements); cl_command_queue queue, int num_elements);
extern int test_subgroup_functions_extended_types(cl_device_id device,
cl_context context,
cl_command_queue queue,
int num_elements);
extern int test_subgroup_functions_non_uniform_vote(cl_device_id device,
cl_context context,
cl_command_queue queue,
int num_elements);
extern int test_subgroup_functions_non_uniform_arithmetic(
cl_device_id device, cl_context context, cl_command_queue queue,
int num_elements);
extern int test_subgroup_functions_ballot(cl_device_id device,
cl_context context,
cl_command_queue queue,
int num_elements);
extern int test_subgroup_functions_clustered_reduce(cl_device_id device,
cl_context context,
cl_command_queue queue,
int num_elements);
extern int test_subgroup_functions_shuffle(cl_device_id device,
cl_context context,
cl_command_queue queue,
int num_elements);
extern int test_subgroup_functions_shuffle_relative(cl_device_id device,
cl_context context,
cl_command_queue queue,
int num_elements);
#endif /*_procs_h*/ #endif /*_procs_h*/

View File

@@ -0,0 +1,106 @@
//
// Copyright (c) 2021 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "subgroup_common_kernels.h"
const char* bcast_source =
"__kernel void test_bcast(const __global Type *in, "
"__global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" Type x = in[gid];\n"
" uint which_sub_group_local_id = xy[gid].z;\n"
" out[gid] = sub_group_broadcast(x, which_sub_group_local_id);\n"
"}\n";
const char* redadd_source = "__kernel void test_redadd(const __global Type "
"*in, __global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_reduce_add(in[gid]);\n"
"}\n";
const char* redmax_source = "__kernel void test_redmax(const __global Type "
"*in, __global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_reduce_max(in[gid]);\n"
"}\n";
const char* redmin_source = "__kernel void test_redmin(const __global Type "
"*in, __global int4 *xy, __global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_reduce_min(in[gid]);\n"
"}\n";
const char* scinadd_source =
"__kernel void test_scinadd(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_inclusive_add(in[gid]);\n"
"}\n";
const char* scinmax_source =
"__kernel void test_scinmax(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_inclusive_max(in[gid]);\n"
"}\n";
const char* scinmin_source =
"__kernel void test_scinmin(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_inclusive_min(in[gid]);\n"
"}\n";
const char* scexadd_source =
"__kernel void test_scexadd(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_exclusive_add(in[gid]);\n"
"}\n";
const char* scexmax_source =
"__kernel void test_scexmax(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_exclusive_max(in[gid]);\n"
"}\n";
const char* scexmin_source =
"__kernel void test_scexmin(const __global Type *in, __global int4 *xy, "
"__global Type *out)\n"
"{\n"
" int gid = get_global_id(0);\n"
" XY(xy,gid);\n"
" out[gid] = sub_group_scan_exclusive_min(in[gid]);\n"
"}\n";

View File

@@ -0,0 +1,32 @@
//
// Copyright (c) 2021 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef SUBGROUPKERNELSOURCES_H
#define SUBGROUPKERNELSOURCES_H
#include "subhelpers.h"
extern const char* bcast_source;
extern const char* redadd_source;
extern const char* redmax_source;
extern const char* redmin_source;
extern const char* scinadd_source;
extern const char* scinmax_source;
extern const char* scinmin_source;
extern const char* scexadd_source;
extern const char* scexmax_source;
extern const char* scexmin_source;
#endif

View File

@@ -0,0 +1,911 @@
//
// Copyright (c) 2020 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef SUBGROUPCOMMONTEMPLATES_H
#define SUBGROUPCOMMONTEMPLATES_H
#include "typeWrappers.h"
#include <bitset>
#include "CL/cl_half.h"
#include "subhelpers.h"
#include <set>
typedef std::bitset<128> bs128;
static cl_uint4 generate_bit_mask(cl_uint subgroup_local_id,
const std::string &mask_type,
cl_uint max_sub_group_size)
{
bs128 mask128;
cl_uint4 mask;
cl_uint pos = subgroup_local_id;
if (mask_type == "eq") mask128.set(pos);
if (mask_type == "le" || mask_type == "lt")
{
for (cl_uint i = 0; i <= pos; i++) mask128.set(i);
if (mask_type == "lt") mask128.reset(pos);
}
if (mask_type == "ge" || mask_type == "gt")
{
for (cl_uint i = pos; i < max_sub_group_size; i++) mask128.set(i);
if (mask_type == "gt") mask128.reset(pos);
}
// convert std::bitset<128> to uint4
auto const uint_mask = bs128{ static_cast<unsigned long>(-1) };
mask.s0 = (mask128 & uint_mask).to_ulong();
mask128 >>= 32;
mask.s1 = (mask128 & uint_mask).to_ulong();
mask128 >>= 32;
mask.s2 = (mask128 & uint_mask).to_ulong();
mask128 >>= 32;
mask.s3 = (mask128 & uint_mask).to_ulong();
return mask;
}
// DESCRIPTION :
// sub_group_broadcast - each work_item registers it's own value.
// All work_items in subgroup takes one value from only one (any) work_item
// sub_group_broadcast_first - same as type 0. All work_items in
// subgroup takes only one value from only one chosen (the smallest subgroup ID)
// work_item
// sub_group_non_uniform_broadcast - same as type 0 but
// only 4 work_items from subgroup enter the code (are active)
template <typename Ty, SubgroupsBroadcastOp operation> struct BC
{
static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
{
int i, ii, j, k, n;
int ng = test_params.global_workgroup_size;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int nj = (nw + ns - 1) / ns;
int d = ns > 100 ? 100 : ns;
int non_uniform_size = ng % nw;
ng = ng / nw;
int last_subgroup_size = 0;
ii = 0;
log_info(" sub_group_%s(%s)...\n", operation_names(operation),
TypeManager<Ty>::name());
if (non_uniform_size)
{
log_info(" non uniform work group size mode ON\n");
ng++;
}
for (k = 0; k < ng; ++k)
{ // for each work_group
if (non_uniform_size && k == ng - 1)
{
set_last_workgroup_params(non_uniform_size, nj, ns, nw,
last_subgroup_size);
}
for (j = 0; j < nj; ++j)
{ // for each subgroup
ii = j * ns;
if (last_subgroup_size && j == nj - 1)
{
n = last_subgroup_size;
}
else
{
n = ii + ns > nw ? nw - ii : ns;
}
int bcast_if = 0;
int bcast_elseif = 0;
int bcast_index = (int)(genrand_int32(gMTdata) & 0x7fffffff)
% (d > n ? n : d);
// l - calculate subgroup local id from which value will be
// broadcasted (one the same value for whole subgroup)
if (operation != SubgroupsBroadcastOp::broadcast)
{
// reduce brodcasting index in case of non_uniform and
// last workgroup last subgroup
if (last_subgroup_size && j == nj - 1
&& last_subgroup_size < NR_OF_ACTIVE_WORK_ITEMS)
{
bcast_if = bcast_index % last_subgroup_size;
bcast_elseif = bcast_if;
}
else
{
bcast_if = bcast_index % NR_OF_ACTIVE_WORK_ITEMS;
bcast_elseif = NR_OF_ACTIVE_WORK_ITEMS
+ bcast_index % (n - NR_OF_ACTIVE_WORK_ITEMS);
}
}
for (i = 0; i < n; ++i)
{
if (operation == SubgroupsBroadcastOp::broadcast)
{
int midx = 4 * ii + 4 * i + 2;
m[midx] = (cl_int)bcast_index;
}
else
{
if (i < NR_OF_ACTIVE_WORK_ITEMS)
{
// index of the third
// element int the vector.
int midx = 4 * ii + 4 * i + 2;
// storing information about
// broadcasting index -
// earlier calculated
m[midx] = (cl_int)bcast_if;
}
else
{ // index of the third
// element int the vector.
int midx = 4 * ii + 4 * i + 3;
m[midx] = (cl_int)bcast_elseif;
}
}
// calculate value for broadcasting
cl_ulong number = genrand_int64(gMTdata);
set_value(t[ii + i], number);
}
}
// Now map into work group using map from device
for (j = 0; j < nw; ++j)
{ // for each element in work_group
// calculate index as number of subgroup
// plus subgroup local id
x[j] = t[j];
}
x += nw;
m += 4 * nw;
}
}
static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
const WorkGroupParams &test_params)
{
int ii, i, j, k, l, n;
int ng = test_params.global_workgroup_size;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int nj = (nw + ns - 1) / ns;
Ty tr, rr;
int non_uniform_size = ng % nw;
ng = ng / nw;
int last_subgroup_size = 0;
if (non_uniform_size) ng++;
for (k = 0; k < ng; ++k)
{ // for each work_group
if (non_uniform_size && k == ng - 1)
{
set_last_workgroup_params(non_uniform_size, nj, ns, nw,
last_subgroup_size);
}
for (j = 0; j < nw; ++j)
{ // inside the work_group
mx[j] = x[j]; // read host inputs for work_group
my[j] = y[j]; // read device outputs for work_group
}
for (j = 0; j < nj; ++j)
{ // for each subgroup
ii = j * ns;
if (last_subgroup_size && j == nj - 1)
{
n = last_subgroup_size;
}
else
{
n = ii + ns > nw ? nw - ii : ns;
}
// Check result
if (operation == SubgroupsBroadcastOp::broadcast_first)
{
int lowest_active_id = -1;
for (i = 0; i < n; ++i)
{
lowest_active_id = i < NR_OF_ACTIVE_WORK_ITEMS
? 0
: NR_OF_ACTIVE_WORK_ITEMS;
// findout if broadcasted
// value is the same
tr = mx[ii + lowest_active_id];
// findout if broadcasted to all
rr = my[ii + i];
if (!compare(rr, tr))
{
log_error(
"ERROR: sub_group_broadcast_first(%s) "
"mismatch "
"for local id %d in sub group %d in group "
"%d\n",
TypeManager<Ty>::name(), i, j, k);
return TEST_FAIL;
}
}
}
else
{
for (i = 0; i < n; ++i)
{
if (operation == SubgroupsBroadcastOp::broadcast)
{
int midx = 4 * ii + 4 * i + 2;
l = (int)m[midx];
tr = mx[ii + l];
}
else
{
if (i < NR_OF_ACTIVE_WORK_ITEMS)
{ // take index of array where info
// which work_item will be
// broadcast its value is stored
int midx = 4 * ii + 4 * i + 2;
// take subgroup local id of
// this work_item
l = (int)m[midx];
// take value generated on host
// for this work_item
tr = mx[ii + l];
}
else
{
int midx = 4 * ii + 4 * i + 3;
l = (int)m[midx];
tr = mx[ii + l];
}
}
rr = my[ii + i]; // read device outputs for
// work_item in the subgroup
if (!compare(rr, tr))
{
log_error("ERROR: sub_group_%s(%s) "
"mismatch for local id %d in sub "
"group %d in group %d - got %lu "
"expected %lu\n",
operation_names(operation),
TypeManager<Ty>::name(), i, j, k, rr, tr);
return TEST_FAIL;
}
}
}
}
x += nw;
y += nw;
m += 4 * nw;
}
log_info(" sub_group_%s(%s)... passed\n", operation_names(operation),
TypeManager<Ty>::name());
return TEST_PASS;
}
};
static float to_float(subgroups::cl_half x) { return cl_half_to_float(x.data); }
static subgroups::cl_half to_half(float x)
{
subgroups::cl_half value;
value.data = cl_half_from_float(x, CL_HALF_RTE);
return value;
}
// for integer types
template <typename Ty> inline Ty calculate(Ty a, Ty b, ArithmeticOp operation)
{
switch (operation)
{
case ArithmeticOp::add_: return a + b;
case ArithmeticOp::max_: return a > b ? a : b;
case ArithmeticOp::min_: return a < b ? a : b;
case ArithmeticOp::mul_: return a * b;
case ArithmeticOp::and_: return a & b;
case ArithmeticOp::or_: return a | b;
case ArithmeticOp::xor_: return a ^ b;
case ArithmeticOp::logical_and: return a && b;
case ArithmeticOp::logical_or: return a || b;
case ArithmeticOp::logical_xor: return !a ^ !b;
default: log_error("Unknown operation request"); break;
}
return 0;
}
// Specialize for floating points.
template <>
inline cl_double calculate(cl_double a, cl_double b, ArithmeticOp operation)
{
switch (operation)
{
case ArithmeticOp::add_: {
return a + b;
}
case ArithmeticOp::max_: {
return a > b ? a : b;
}
case ArithmeticOp::min_: {
return a < b ? a : b;
}
case ArithmeticOp::mul_: {
return a * b;
}
default: log_error("Unknown operation request"); break;
}
return 0;
}
template <>
inline cl_float calculate(cl_float a, cl_float b, ArithmeticOp operation)
{
switch (operation)
{
case ArithmeticOp::add_: {
return a + b;
}
case ArithmeticOp::max_: {
return a > b ? a : b;
}
case ArithmeticOp::min_: {
return a < b ? a : b;
}
case ArithmeticOp::mul_: {
return a * b;
}
default: log_error("Unknown operation request"); break;
}
return 0;
}
template <>
inline subgroups::cl_half calculate(subgroups::cl_half a, subgroups::cl_half b,
ArithmeticOp operation)
{
switch (operation)
{
case ArithmeticOp::add_: return to_half(to_float(a) + to_float(b));
case ArithmeticOp::max_:
return to_float(a) > to_float(b) || is_half_nan(b.data) ? a : b;
case ArithmeticOp::min_:
return to_float(a) < to_float(b) || is_half_nan(b.data) ? a : b;
case ArithmeticOp::mul_: return to_half(to_float(a) * to_float(b));
default: log_error("Unknown operation request"); break;
}
return to_half(0);
}
template <typename Ty> bool is_floating_point()
{
return std::is_floating_point<Ty>::value
|| std::is_same<Ty, subgroups::cl_half>::value;
}
template <typename Ty, ArithmeticOp operation>
void genrand(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
{
int nj = (nw + ns - 1) / ns;
for (int k = 0; k < ng; ++k)
{
for (int j = 0; j < nj; ++j)
{
int ii = j * ns;
int n = ii + ns > nw ? nw - ii : ns;
for (int i = 0; i < n; ++i)
{
cl_ulong out_value;
double y;
if (operation == ArithmeticOp::mul_
|| operation == ArithmeticOp::add_)
{
// work around to avoid overflow, do not use 0 for
// multiplication
out_value = (genrand_int32(gMTdata) % 4) + 1;
}
else
{
out_value = genrand_int64(gMTdata) % (32 * n);
if ((operation == ArithmeticOp::logical_and
|| operation == ArithmeticOp::logical_or
|| operation == ArithmeticOp::logical_xor)
&& ((out_value >> 32) & 1) == 0)
out_value = 0; // increase probability of false
}
set_value(t[ii + i], out_value);
}
}
// Now map into work group using map from device
for (int j = 0; j < nw; ++j)
{
x[j] = t[j];
}
x += nw;
m += 4 * nw;
}
}
template <typename Ty, ShuffleOp operation> struct SHF
{
static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
{
int i, ii, j, k, l, n, delta;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
int nj = (nw + ns - 1) / ns;
int d = ns > 100 ? 100 : ns;
ii = 0;
ng = ng / nw;
log_info(" sub_group_%s(%s)...\n", operation_names(operation),
TypeManager<Ty>::name());
for (k = 0; k < ng; ++k)
{ // for each work_group
for (j = 0; j < nj; ++j)
{ // for each subgroup
ii = j * ns;
n = ii + ns > nw ? nw - ii : ns;
for (i = 0; i < n; ++i)
{
int midx = 4 * ii + 4 * i + 2;
l = (int)(genrand_int32(gMTdata) & 0x7fffffff)
% (d > n ? n : d);
switch (operation)
{
case ShuffleOp::shuffle:
case ShuffleOp::shuffle_xor:
// storing information about shuffle index
m[midx] = (cl_int)l;
break;
case ShuffleOp::shuffle_up:
delta = l; // calculate delta for shuffle up
if (i - delta < 0)
{
delta = i;
}
m[midx] = (cl_int)delta;
break;
case ShuffleOp::shuffle_down:
delta = l; // calculate delta for shuffle down
if (i + delta >= n)
{
delta = n - 1 - i;
}
m[midx] = (cl_int)delta;
break;
default: break;
}
cl_ulong number = genrand_int64(gMTdata);
set_value(t[ii + i], number);
}
}
// Now map into work group using map from device
for (j = 0; j < nw; ++j)
{ // for each element in work_group
x[j] = t[j];
}
x += nw;
m += 4 * nw;
}
}
static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
const WorkGroupParams &test_params)
{
int ii, i, j, k, l, n;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
int nj = (nw + ns - 1) / ns;
Ty tr, rr;
ng = ng / nw;
for (k = 0; k < ng; ++k)
{ // for each work_group
for (j = 0; j < nw; ++j)
{ // inside the work_group
mx[j] = x[j]; // read host inputs for work_group
my[j] = y[j]; // read device outputs for work_group
}
for (j = 0; j < nj; ++j)
{ // for each subgroup
ii = j * ns;
n = ii + ns > nw ? nw - ii : ns;
for (i = 0; i < n; ++i)
{ // inside the subgroup
// shuffle index storage
int midx = 4 * ii + 4 * i + 2;
l = (int)m[midx];
rr = my[ii + i];
switch (operation)
{
// shuffle basic - treat l as index
case ShuffleOp::shuffle: tr = mx[ii + l]; break;
// shuffle up - treat l as delta
case ShuffleOp::shuffle_up: tr = mx[ii + i - l]; break;
// shuffle up - treat l as delta
case ShuffleOp::shuffle_down:
tr = mx[ii + i + l];
break;
// shuffle xor - treat l as mask
case ShuffleOp::shuffle_xor:
tr = mx[ii + (i ^ l)];
break;
default: break;
}
if (!compare(rr, tr))
{
log_error("ERROR: sub_group_%s(%s) mismatch for "
"local id %d in sub group %d in group %d\n",
operation_names(operation),
TypeManager<Ty>::name(), i, j, k);
return TEST_FAIL;
}
}
}
x += nw;
y += nw;
m += 4 * nw;
}
log_info(" sub_group_%s(%s)... passed\n", operation_names(operation),
TypeManager<Ty>::name());
return TEST_PASS;
}
};
template <typename Ty, ArithmeticOp operation> struct SCEX_NU
{
static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
{
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
ng = ng / nw;
std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive"
: func_name = "sub_group_scan_exclusive";
log_info(" %s_%s(%s)...\n", func_name.c_str(),
operation_names(operation), TypeManager<Ty>::name());
log_info(" test params: global size = %d local size = %d subgroups "
"size = %d work item mask = 0x%x \n",
test_params.global_workgroup_size, nw, ns, work_items_mask);
genrand<Ty, operation>(x, t, m, ns, nw, ng);
}
static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
const WorkGroupParams &test_params)
{
int ii, i, j, k, n;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
int nj = (nw + ns - 1) / ns;
Ty tr, rr;
ng = ng / nw;
std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_scan_exclusive"
: func_name = "sub_group_scan_exclusive";
uint32_t use_work_items_mask;
// for uniform case take into consideration all workitems
use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask;
for (k = 0; k < ng; ++k)
{ // for each work_group
// Map to array indexed to array indexed by local ID and sub group
for (j = 0; j < nw; ++j)
{ // inside the work_group
mx[j] = x[j]; // read host inputs for work_group
my[j] = y[j]; // read device outputs for work_group
}
for (j = 0; j < nj; ++j)
{
ii = j * ns;
n = ii + ns > nw ? nw - ii : ns;
std::set<int> active_work_items;
for (i = 0; i < n; ++i)
{
uint32_t check_work_item = 1 << (i % 32);
if (use_work_items_mask & check_work_item)
{
active_work_items.insert(i);
}
}
if (active_work_items.empty())
{
log_info(" No acitve workitems in workgroup id = %d "
"subgroup id = %d - no calculation\n",
k, j);
continue;
}
else if (active_work_items.size() == 1)
{
log_info(" One active workitem in workgroup id = %d "
"subgroup id = %d - no calculation\n",
k, j);
continue;
}
else
{
tr = TypeManager<Ty>::identify_limits(operation);
int idx = 0;
for (const int &active_work_item : active_work_items)
{
rr = my[ii + active_work_item];
if (idx == 0) continue;
if (!compare_ordered(rr, tr))
{
log_error(
"ERROR: %s_%s(%s) "
"mismatch for local id %d in sub group %d in "
"group %d Expected: %d Obtained: %d\n",
func_name.c_str(), operation_names(operation),
TypeManager<Ty>::name(), i, j, k, tr, rr);
return TEST_FAIL;
}
tr = calculate<Ty>(tr, mx[ii + active_work_item],
operation);
idx++;
}
}
}
x += nw;
y += nw;
m += 4 * nw;
}
log_info(" %s_%s(%s)... passed\n", func_name.c_str(),
operation_names(operation), TypeManager<Ty>::name());
return TEST_PASS;
}
};
// Test for scan inclusive non uniform functions
template <typename Ty, ArithmeticOp operation> struct SCIN_NU
{
static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
{
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
ng = ng / nw;
std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive"
: func_name = "sub_group_scan_inclusive";
genrand<Ty, operation>(x, t, m, ns, nw, ng);
log_info(" %s_%s(%s)...\n", func_name.c_str(),
operation_names(operation), TypeManager<Ty>::name());
log_info(" test params: global size = %d local size = %d subgroups "
"size = %d work item mask = 0x%x \n",
test_params.global_workgroup_size, nw, ns, work_items_mask);
}
static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
const WorkGroupParams &test_params)
{
int ii, i, j, k, n;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
uint32_t work_items_mask = test_params.work_items_mask;
int nj = (nw + ns - 1) / ns;
Ty tr, rr;
ng = ng / nw;
std::string func_name;
work_items_mask ? func_name = "sub_group_non_uniform_scan_inclusive"
: func_name = "sub_group_scan_inclusive";
uint32_t use_work_items_mask;
// for uniform case take into consideration all workitems
use_work_items_mask = !work_items_mask ? 0xFFFFFFFF : work_items_mask;
// std::bitset<32> mask32(use_work_items_mask);
// for (int k) mask32.count();
for (k = 0; k < ng; ++k)
{ // for each work_group
// Map to array indexed to array indexed by local ID and sub group
for (j = 0; j < nw; ++j)
{ // inside the work_group
mx[j] = x[j]; // read host inputs for work_group
my[j] = y[j]; // read device outputs for work_group
}
for (j = 0; j < nj; ++j)
{
ii = j * ns;
n = ii + ns > nw ? nw - ii : ns;
std::set<int> active_work_items;
int catch_frist_active = -1;
for (i = 0; i < n; ++i)
{
uint32_t check_work_item = 1 << (i % 32);
if (use_work_items_mask & check_work_item)
{
if (catch_frist_active == -1)
{
catch_frist_active = i;
}
active_work_items.insert(i);
}
}
if (active_work_items.empty())
{
log_info(" No acitve workitems in workgroup id = %d "
"subgroup id = %d - no calculation\n",
k, j);
continue;
}
else
{
tr = TypeManager<Ty>::identify_limits(operation);
for (const int &active_work_item : active_work_items)
{
rr = my[ii + active_work_item];
if (active_work_items.size() == 1)
{
tr = mx[ii + catch_frist_active];
}
else
{
tr = calculate<Ty>(tr, mx[ii + active_work_item],
operation);
}
if (!compare_ordered<Ty>(rr, tr))
{
log_error(
"ERROR: %s_%s(%s) "
"mismatch for local id %d in sub group %d "
"in "
"group %d Expected: %d Obtained: %d\n",
func_name.c_str(), operation_names(operation),
TypeManager<Ty>::name(), active_work_item, j, k,
tr, rr);
return TEST_FAIL;
}
}
}
}
x += nw;
y += nw;
m += 4 * nw;
}
log_info(" %s_%s(%s)... passed\n", func_name.c_str(),
operation_names(operation), TypeManager<Ty>::name());
return TEST_PASS;
}
};
// Test for reduce non uniform functions.
// gen() fills the input buffer with random data; chk() recomputes the
// reduction on the host over the work items selected by the work-item mask
// and verifies that every active work item received that value.
template <typename Ty, ArithmeticOp operation> struct RED_NU
{
    // Generate random input. x = device input, t = staging buffer,
    // m = work-item map buffer (4 ints per work item).
    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
    {
        int nw = test_params.local_workgroup_size;
        int ns = test_params.subgroup_size;
        int ng = test_params.global_workgroup_size;
        uint32_t work_items_mask = test_params.work_items_mask;
        ng = ng / nw; // number of work groups
        // A non-zero mask selects the non-uniform variant of the built-in.
        std::string func_name = work_items_mask
            ? "sub_group_non_uniform_reduce"
            : "sub_group_reduce";
        log_info(" %s_%s(%s)...\n", func_name.c_str(),
                 operation_names(operation), TypeManager<Ty>::name());
        log_info(" test params: global size = %d local size = %d subgroups "
                 "size = %d work item mask = 0x%x \n",
                 test_params.global_workgroup_size, nw, ns, work_items_mask);
        genrand<Ty, operation>(x, t, m, ns, nw, ng);
    }
    // Verify device output y against a host-side reference reduction of x.
    // mx/my are host scratch buffers of one work group each.
    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
                   const WorkGroupParams &test_params)
    {
        int nw = test_params.local_workgroup_size;
        int ns = test_params.subgroup_size;
        int ng = test_params.global_workgroup_size;
        uint32_t work_items_mask = test_params.work_items_mask;
        int nj = (nw + ns - 1) / ns; // subgroups per work group
        ng = ng / nw;                // number of work groups
        Ty tr, rr;
        std::string func_name = work_items_mask
            ? "sub_group_non_uniform_reduce"
            : "sub_group_reduce";
        // Mask of zero means the uniform built-in: all work items active.
        // (Loop-invariant, so computed once instead of per work group.)
        uint32_t use_work_items_mask =
            !work_items_mask ? 0xFFFFFFFF : work_items_mask;
        for (int k = 0; k < ng; ++k)
        {
            // Map to array indexed to array indexed by local ID and sub
            // group
            for (int j = 0; j < nw; ++j)
            {
                mx[j] = x[j];
                my[j] = y[j];
            }
            for (int j = 0; j < nj; ++j)
            {
                int ii = j * ns;
                // Last subgroup of the work group may be partial.
                int n = ii + ns > nw ? nw - ii : ns;
                std::set<int> active_work_items;
                int first_active = -1;
                for (int i = 0; i < n; ++i)
                {
                    uint32_t check_work_item = 1 << (i % 32);
                    if (use_work_items_mask & check_work_item)
                    {
                        active_work_items.insert(i);
                        if (first_active == -1)
                        {
                            // First active item seeds the accumulator.
                            first_active = i;
                            tr = mx[ii + i];
                        }
                        else
                        {
                            tr = calculate<Ty>(tr, mx[ii + i], operation);
                        }
                    }
                }
                if (active_work_items.empty())
                {
                    log_info(" No active workitems in workgroup id = %d "
                             "subgroup id = %d - no calculation\n",
                             k, j);
                    continue;
                }
                // Every active work item must hold the same reduced value.
                for (const int &active_work_item : active_work_items)
                {
                    rr = my[ii + active_work_item];
                    if (!compare_ordered<Ty>(rr, tr))
                    {
                        // NOTE(review): %d assumes an integral Ty; the value
                        // columns may print garbage for float/half types.
                        log_error("ERROR: %s_%s(%s) "
                                  "mismatch for local id %d in sub group %d in "
                                  "group %d Expected: %d Obtained: %d\n",
                                  func_name.c_str(), operation_names(operation),
                                  TypeManager<Ty>::name(), active_work_item, j,
                                  k, tr, rr);
                        return TEST_FAIL;
                    }
                }
            }
            x += nw;
            y += nw;
            m += 4 * nw;
        }
        log_info(" %s_%s(%s)... passed\n", func_name.c_str(),
                 operation_names(operation), TypeManager<Ty>::name());
        return TEST_PASS;
    }
};
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -59,10 +59,15 @@ static const char *gbar_source =
// barrier test functions // barrier test functions
template <int Which> struct BAR template <int Which> struct BAR
{ {
static void gen(cl_int *x, cl_int *t, cl_int *m, int ns, int nw, int ng) static void gen(cl_int *x, cl_int *t, cl_int *m,
const WorkGroupParams &test_params)
{ {
int i, ii, j, k, n; int i, ii, j, k, n;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
int nj = (nw + ns - 1) / ns; int nj = (nw + ns - 1) / ns;
ng = ng / nw;
int e; int e;
ii = 0; ii = 0;
@@ -79,8 +84,7 @@ template <int Which> struct BAR
// Now map into work group using map from device // Now map into work group using map from device
for (j = 0; j < nw; ++j) for (j = 0; j < nw; ++j)
{ {
i = m[2 * j + 1] * ns + m[2 * j]; x[j] = t[j];
x[j] = t[i];
} }
x += nw; x += nw;
@@ -89,10 +93,14 @@ template <int Which> struct BAR
} }
static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m, static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m,
int ns, int nw, int ng) const WorkGroupParams &test_params)
{ {
int ii, i, j, k, n; int ii, i, j, k, n;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
int nj = (nw + ns - 1) / ns; int nj = (nw + ns - 1) / ns;
ng = ng / nw;
cl_int tr, rr; cl_int tr, rr;
if (Which == 0) if (Which == 0)
@@ -105,9 +113,8 @@ template <int Which> struct BAR
// Map to array indexed to array indexed by local ID and sub group // Map to array indexed to array indexed by local ID and sub group
for (j = 0; j < nw; ++j) for (j = 0; j < nw; ++j)
{ {
i = m[2 * j + 1] * ns + m[2 * j]; mx[j] = x[j];
mx[i] = x[j]; my[j] = y[j];
my[i] = y[j];
} }
for (j = 0; j < nj; ++j) for (j = 0; j < nj; ++j)
@@ -123,8 +130,9 @@ template <int Which> struct BAR
if (tr != rr) if (tr != rr)
{ {
log_error("ERROR: sub_group_barrier mismatch for local " log_error("ERROR: sub_group_barrier mismatch for local "
"id %d in sub group %d in group %d\n", "id %d in sub group %d in group %d expected "
i, j, k); "%d got %d\n",
i, j, k, tr, rr);
return -1; return -1;
} }
} }
@@ -144,18 +152,18 @@ int test_barrier_functions(cl_device_id device, cl_context context,
cl_command_queue queue, int num_elements, cl_command_queue queue, int num_elements,
bool useCoreSubgroups) bool useCoreSubgroups)
{ {
int error; int error = TEST_PASS;
// Adjust these individually below if desired/needed // Adjust these individually below if desired/needed
#define G 2000 constexpr size_t global_work_size = 2000;
#define L 200 constexpr size_t local_work_size = 200;
WorkGroupParams test_params(global_work_size, local_work_size);
error = test<cl_int, BAR<0>, G, L>::run(device, context, queue, test_params.use_core_subgroups = useCoreSubgroups;
num_elements, "test_lbar", error = test<cl_int, BAR<0>>::run(device, context, queue, num_elements,
lbar_source, 0, useCoreSubgroups); "test_lbar", lbar_source, test_params);
error = test<cl_int, BAR<1>, G, L, G>::run( error |= test<cl_int, BAR<1>, global_work_size>::run(
device, context, queue, num_elements, "test_gbar", gbar_source, 0, device, context, queue, num_elements, "test_gbar", gbar_source,
useCoreSubgroups); test_params);
return error; return error;
} }

View File

@@ -46,7 +46,7 @@ static const char *ifp_source =
"#define INST_COUNT 0x3\n" "#define INST_COUNT 0x3\n"
"\n" "\n"
"__kernel void\n" "__kernel void\n"
"test_ifp(const __global int *in, __global int2 *xy, __global int *out)\n" "test_ifp(const __global int *in, __global int4 *xy, __global int *out)\n"
"{\n" "{\n"
" __local atomic_int loc[NUM_LOC];\n" " __local atomic_int loc[NUM_LOC];\n"
"\n" "\n"
@@ -225,10 +225,15 @@ void run_insts(cl_int *x, cl_int *p, int n)
struct IFP struct IFP
{ {
static void gen(cl_int *x, cl_int *t, cl_int *, int ns, int nw, int ng) static void gen(cl_int *x, cl_int *t, cl_int *,
const WorkGroupParams &test_params)
{ {
int k; int k;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
int nj = (nw + ns - 1) / ns; int nj = (nw + ns - 1) / ns;
ng = ng / nw;
// We need at least 2 sub groups per group for this test // We need at least 2 sub groups per group for this test
if (nj == 1) return; if (nj == 1) return;
@@ -240,11 +245,15 @@ struct IFP
} }
} }
static int chk(cl_int *x, cl_int *y, cl_int *t, cl_int *, cl_int *, int ns, static int chk(cl_int *x, cl_int *y, cl_int *t, cl_int *, cl_int *,
int nw, int ng) const WorkGroupParams &test_params)
{ {
int i, k; int i, k;
int nw = test_params.local_workgroup_size;
int ns = test_params.subgroup_size;
int ng = test_params.global_workgroup_size;
int nj = (nw + ns - 1) / ns; int nj = (nw + ns - 1) / ns;
ng = ng / nw;
// We need at least 2 sub groups per group for this tes // We need at least 2 sub groups per group for this tes
if (nj == 1) return 0; if (nj == 1) return 0;
@@ -275,14 +284,17 @@ struct IFP
int test_ifp(cl_device_id device, cl_context context, cl_command_queue queue, int test_ifp(cl_device_id device, cl_context context, cl_command_queue queue,
int num_elements, bool useCoreSubgroups) int num_elements, bool useCoreSubgroups)
{ {
int error; int error = TEST_PASS;
// Global/local work group sizes
// Adjust these individually below if desired/needed // Adjust these individually below if desired/needed
#define G 2000 constexpr size_t global_work_size = 2000;
#define L 200 constexpr size_t local_work_size = 200;
error = test<cl_int, IFP, G, L>::run(device, context, queue, num_elements, WorkGroupParams test_params(global_work_size, local_work_size);
"test_ifp", ifp_source, NUM_LOC + 1, test_params.use_core_subgroups = useCoreSubgroups;
useCoreSubgroups); test_params.dynsc = NUM_LOC + 1;
error = test<cl_int, IFP>::run(device, context, queue, num_elements,
"test_ifp", ifp_source, test_params);
return error; return error;
} }

View File

@@ -0,0 +1,217 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "procs.h"
#include "subhelpers.h"
#include "subgroup_common_kernels.h"
#include "subgroup_common_templates.h"
#include "harness/conversions.h"
#include "harness/typeWrappers.h"
namespace {
// Any/All test functions
// Any/All test functions.
// gen() seeds each subgroup with one of three patterns (all zero, a single
// non-zero element, all ones); chk() compares the device's any/all vote
// against a host-side reference.
template <NonUniformVoteOp operation> struct AA
{
    static void gen(cl_int *x, cl_int *t, cl_int *m,
                    const WorkGroupParams &test_params)
    {
        int ng = test_params.global_workgroup_size;
        int nw = test_params.local_workgroup_size;
        int ns = test_params.subgroup_size;
        int nj = (nw + ns - 1) / ns;
        ng = ng / nw;
        log_info(" sub_group_%s...\n", operation_names(operation));
        for (int k = 0; k < ng; ++k)
        {
            for (int j = 0; j < nj; ++j)
            {
                int ii = j * ns;
                int n = ii + ns > nw ? nw - ii : ns;
                // Pick one of three per-subgroup patterns at random.
                int pattern = (int)(genrand_int32(gMTdata) % 3);
                if (pattern == 0)
                {
                    // All elements zero.
                    memset(&t[ii], 0, n * sizeof(cl_int));
                }
                else if (pattern == 1)
                {
                    // Exactly one non-zero element at a random position.
                    memset(&t[ii], 0, n * sizeof(cl_int));
                    int pos = (int)(genrand_int32(gMTdata) % (cl_uint)n);
                    t[ii + pos] = 41;
                }
                else
                {
                    // All elements non-zero (every bit set).
                    memset(&t[ii], 0xff, n * sizeof(cl_int));
                }
            }
            // Copy the generated patterns into this work group's input.
            for (int j = 0; j < nw; ++j) x[j] = t[j];
            x += nw;
            m += 4 * nw;
        }
    }
    static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m,
                   const WorkGroupParams &test_params)
    {
        int ng = test_params.global_workgroup_size;
        int nw = test_params.local_workgroup_size;
        int ns = test_params.subgroup_size;
        int nj = (nw + ns - 1) / ns;
        ng = ng / nw;
        for (int k = 0; k < ng; ++k)
        {
            // Mirror the device buffers into host-side scratch arrays.
            for (int j = 0; j < nw; ++j)
            {
                mx[j] = x[j];
                my[j] = y[j];
            }
            for (int j = 0; j < nj; ++j)
            {
                int ii = j * ns;
                int n = ii + ns > nw ? nw - ii : ns;
                // Host reference: logical OR (any) / AND (all) over the
                // subgroup's inputs.
                cl_int expected = 0;
                if (operation == NonUniformVoteOp::any)
                {
                    for (int i = 0; i < n; ++i) expected |= mx[ii + i] != 0;
                }
                if (operation == NonUniformVoteOp::all)
                {
                    expected = 1;
                    for (int i = 0; i < n; ++i) expected &= mx[ii + i] != 0;
                }
                // Every work item must report the subgroup-wide vote.
                for (int i = 0; i < n; ++i)
                {
                    cl_int device_result = my[ii + i] != 0;
                    if (device_result != expected)
                    {
                        log_error("ERROR: sub_group_%s mismatch for local id "
                                  "%d in sub group %d in group %d\n",
                                  operation_names(operation), i, j, k);
                        return TEST_FAIL;
                    }
                }
            }
            x += nw;
            y += nw;
            m += 4 * nw;
        }
        log_info(" sub_group_%s... passed\n", operation_names(operation));
        return TEST_PASS;
    }
};
// OpenCL C kernel: each work item records its mapping via XY and stores the
// subgroup-wide sub_group_any() vote of its input element.
static const char *any_source = "__kernel void test_any(const __global Type "
                                "*in, __global int4 *xy, __global Type *out)\n"
                                "{\n"
                                " int gid = get_global_id(0);\n"
                                " XY(xy,gid);\n"
                                " out[gid] = sub_group_any(in[gid]);\n"
                                "}\n";
// OpenCL C kernel: same shape as above but with the sub_group_all() vote.
static const char *all_source = "__kernel void test_all(const __global Type "
                                "*in, __global int4 *xy, __global Type *out)\n"
                                "{\n"
                                " int gid = get_global_id(0);\n"
                                " XY(xy,gid);\n"
                                " out[gid] = sub_group_all(in[gid]);\n"
                                "}\n";
// Runs broadcast plus every (reduce / inclusive-scan / exclusive-scan) x
// (add / max / min) combination for type T.
// Error codes are OR-combined so all sub-tests run even after a failure.
template <typename T>
int run_broadcast_scan_reduction_for_type(RunTestForType rft)
{
    int error = rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
        "test_bcast", bcast_source);
    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("test_redadd",
                                                            redadd_source);
    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("test_redmax",
                                                            redmax_source);
    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("test_redmin",
                                                            redmin_source);
    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>("test_scinadd",
                                                             scinadd_source);
    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>("test_scinmax",
                                                             scinmax_source);
    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>("test_scinmin",
                                                             scinmin_source);
    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>("test_scexadd",
                                                             scexadd_source);
    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>("test_scexmax",
                                                             scexmax_source);
    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>("test_scexmin",
                                                             scexmin_source);
    return error;
}
}
// Entry point from main: runs the any/all vote tests and the
// broadcast/scan/reduction suite for every applicable scalar type.
int test_subgroup_functions(cl_device_id device, cl_context context,
                            cl_command_queue queue, int num_elements,
                            bool useCoreSubgroups)
{
    // Work sizes shared by all sub-cases; adjust here if needed.
    constexpr size_t global_work_size = 2000;
    constexpr size_t local_work_size = 200;
    WorkGroupParams test_params(global_work_size, local_work_size);
    // Propagate the core-vs-extension selection to the test harness;
    // previously the parameter was accepted but silently ignored (the
    // barrier tests at the top of this change set do assign it).
    test_params.use_core_subgroups = useCoreSubgroups;
    RunTestForType rft(device, context, queue, num_elements, test_params);
    int error =
        rft.run_impl<cl_int, AA<NonUniformVoteOp::any>>("test_any", any_source);
    error |=
        rft.run_impl<cl_int, AA<NonUniformVoteOp::all>>("test_all", all_source);
    error |= run_broadcast_scan_reduction_for_type<cl_int>(rft);
    error |= run_broadcast_scan_reduction_for_type<cl_uint>(rft);
    error |= run_broadcast_scan_reduction_for_type<cl_long>(rft);
    error |= run_broadcast_scan_reduction_for_type<cl_ulong>(rft);
    error |= run_broadcast_scan_reduction_for_type<cl_float>(rft);
    error |= run_broadcast_scan_reduction_for_type<cl_double>(rft);
    error |= run_broadcast_scan_reduction_for_type<subgroups::cl_half>(rft);
    return error;
}
// Core-profile entry point: runs the suite assuming built-in (OpenCL 2.x
// core) subgroup support, so no extension query is performed.
int test_subgroup_functions_core(cl_device_id device, cl_context context,
                                 cl_command_queue queue, int num_elements)
{
    return test_subgroup_functions(device, context, queue, num_elements, true);
}
// Extension entry point: skips unless the device exposes cl_khr_subgroups,
// then runs the suite through the extension path.
int test_subgroup_functions_ext(cl_device_id device, cl_context context,
                                cl_command_queue queue, int num_elements)
{
    // Guard clause: without the extension there is nothing to test.
    if (!is_extension_available(device, "cl_khr_subgroups"))
    {
        log_info(
            "Device does not support 'cl_khr_subgroups'. Skipping the test.\n");
        return TEST_SKIPPED_ITSELF;
    }
    return test_subgroup_functions(device, context, queue, num_elements, false);
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,340 @@
//
// Copyright (c) 2021 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "procs.h"
#include "subhelpers.h"
#include "subgroup_common_templates.h"
#include "harness/typeWrappers.h"
// Cluster size used by every clustered-reduce kernel below; the string form
// is spliced directly into the OpenCL C source at preprocessing time.
#define CLUSTER_SIZE 4
#define CLUSTER_SIZE_STR "4"
namespace {
// Each kernel follows the same pattern: record the work-item mapping via XY,
// store sizeof(Type) into xy[gid].w only if the built-in returns the same
// type it consumes (host chk() verifies this), then write the clustered
// reduction result to out[gid].
static const char *redadd_clustered_source =
    "__kernel void test_redadd_clustered(const __global Type *in, __global "
    "int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if (sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = sub_group_clustered_reduce_add(in[gid], " CLUSTER_SIZE_STR
    ");\n"
    "}\n";
static const char *redmax_clustered_source =
    "__kernel void test_redmax_clustered(const __global Type *in, __global "
    "int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if (sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = sub_group_clustered_reduce_max(in[gid], " CLUSTER_SIZE_STR
    ");\n"
    "}\n";
static const char *redmin_clustered_source =
    "__kernel void test_redmin_clustered(const __global Type *in, __global "
    "int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if (sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = sub_group_clustered_reduce_min(in[gid], " CLUSTER_SIZE_STR
    ");\n"
    "}\n";
static const char *redmul_clustered_source =
    "__kernel void test_redmul_clustered(const __global Type *in, __global "
    "int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if (sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = sub_group_clustered_reduce_mul(in[gid], " CLUSTER_SIZE_STR
    ");\n"
    "}\n";
static const char *redand_clustered_source =
    "__kernel void test_redand_clustered(const __global Type *in, __global "
    "int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if (sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = sub_group_clustered_reduce_and(in[gid], " CLUSTER_SIZE_STR
    ");\n"
    "}\n";
static const char *redor_clustered_source =
    "__kernel void test_redor_clustered(const __global Type *in, __global int4 "
    "*xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if (sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = sub_group_clustered_reduce_or(in[gid], " CLUSTER_SIZE_STR
    ");\n"
    "}\n";
static const char *redxor_clustered_source =
    "__kernel void test_redxor_clustered(const __global Type *in, __global "
    "int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if (sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = sub_group_clustered_reduce_xor(in[gid], " CLUSTER_SIZE_STR
    ");\n"
    "}\n";
// Logical variants: same structure, operating on the truth value of the
// inputs rather than their bit patterns.
static const char *redand_clustered_logical_source =
    "__kernel void test_redand_clustered_logical(const __global Type *in, "
    "__global int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if (sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR
    ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = "
    "sub_group_clustered_reduce_logical_and(in[gid], " CLUSTER_SIZE_STR ");\n"
    "}\n";
static const char *redor_clustered_logical_source =
    "__kernel void test_redor_clustered_logical(const __global Type *in, "
    "__global int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if (sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR
    ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = "
    "sub_group_clustered_reduce_logical_or(in[gid], " CLUSTER_SIZE_STR ");\n"
    "}\n";
static const char *redxor_clustered_logical_source =
    "__kernel void test_redxor_clustered_logical(const __global Type *in, "
    "__global int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " xy[gid].w = 0;\n"
    " if ( sizeof(in[gid]) == "
    "sizeof(sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR
    ")))\n"
    " {xy[gid].w = sizeof(in[gid]);}\n"
    " out[gid] = "
    "sub_group_clustered_reduce_logical_xor(in[gid], " CLUSTER_SIZE_STR ");\n"
    "}\n";
// DESCRIPTION:
// Test for reduce cluster functions.
// gen() produces random input; chk() recomputes each cluster's reduction on
// the host and also validates the data-type-size sanity value the kernels
// store in xy[gid].w.
template <typename Ty, ArithmeticOp operation> struct RED_CLU
{
    // Generate random input for the clustered reduction test.
    static void gen(Ty *x, Ty *t, cl_int *m, const WorkGroupParams &test_params)
    {
        int nw = test_params.local_workgroup_size;
        int ns = test_params.subgroup_size;
        int ng = test_params.global_workgroup_size;
        ng = ng / nw; // number of work groups
        // Cast: sizeof yields size_t, which does not match %d on LP64.
        log_info(" sub_group_clustered_reduce_%s(%s, %d bytes) ...\n",
                 operation_names(operation), TypeManager<Ty>::name(),
                 (int)sizeof(Ty));
        genrand<Ty, operation>(x, t, m, ns, nw, ng);
    }
    // Verify device output y against a host-side per-cluster reduction of x.
    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m,
                   const WorkGroupParams &test_params)
    {
        int nw = test_params.local_workgroup_size;
        int ns = test_params.subgroup_size;
        int ng = test_params.global_workgroup_size;
        int nj = (nw + ns - 1) / ns; // subgroups per work group
        ng = ng / nw;                // number of work groups
        for (int k = 0; k < ng; ++k)
        {
            std::vector<cl_int> data_type_sizes;
            // Map to array indexed to array indexed by local ID and sub group
            for (int j = 0; j < nw; ++j)
            {
                mx[j] = x[j];
                my[j] = y[j];
                data_type_sizes.push_back(m[4 * j + 3]);
            }
            // The kernel recorded sizeof(Type) only if the built-in's return
            // type matched its input type; any mismatch fails the test.
            for (cl_int dts : data_type_sizes)
            {
                if (dts != sizeof(Ty))
                {
                    log_error("ERROR: sub_group_clustered_reduce_%s(%s) "
                              "wrong data type size detected, expected: %d, "
                              "used by device %d, in group %d\n",
                              operation_names(operation),
                              TypeManager<Ty>::name(), (int)sizeof(Ty), dts, k);
                    return TEST_FAIL;
                }
            }
            for (int j = 0; j < nj; ++j)
            {
                int ii = j * ns;
                int n = ii + ns > nw ? nw - ii : ns;
                // Round up so a subgroup size that is not a multiple of
                // CLUSTER_SIZE still gets a slot for its trailing partial
                // cluster. The previous ns / CLUSTER_SIZE under-allocated
                // and clusters_results[i / CLUSTER_SIZE] could write past
                // the end of the vector.
                int clusters_counter =
                    (ns + CLUSTER_SIZE - 1) / CLUSTER_SIZE;
                std::vector<Ty> clusters_results(clusters_counter);
                // Compute target: restart the accumulator at every cluster
                // boundary and record the running value for the cluster.
                Ty tr = mx[ii];
                for (int i = 0; i < n; ++i)
                {
                    if (i % CLUSTER_SIZE == 0)
                        tr = mx[ii + i];
                    else
                        tr = calculate<Ty>(tr, mx[ii + i], operation);
                    clusters_results[i / CLUSTER_SIZE] = tr;
                }
                // Check result: every work item must hold its cluster's value.
                for (int i = 0; i < n; ++i)
                {
                    Ty rr = my[ii + i];
                    tr = clusters_results[i / CLUSTER_SIZE];
                    if (!compare(rr, tr))
                    {
                        log_error(
                            "ERROR: sub_group_clustered_reduce_%s(%s) mismatch "
                            "for local id %d in sub group %d in group %d\n",
                            operation_names(operation), TypeManager<Ty>::name(),
                            i, j, k);
                        return TEST_FAIL;
                    }
                }
            }
            x += nw;
            y += nw;
            m += 4 * nw;
        }
        log_info(" sub_group_clustered_reduce_%s(%s, %d bytes) ... passed\n",
                 operation_names(operation), TypeManager<Ty>::name(),
                 (int)sizeof(Ty));
        return TEST_PASS;
    }
};
// Runs the clustered reduce add/max/min/mul tests for type T.
// Error codes are OR-combined so every sub-test executes.
template <typename T>
int run_cluster_red_add_max_min_mul_for_type(RunTestForType rft)
{
    int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::add_>>(
        "test_redadd_clustered", redadd_clustered_source);
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::max_>>(
        "test_redmax_clustered", redmax_clustered_source);
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::min_>>(
        "test_redmin_clustered", redmin_clustered_source);
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::mul_>>(
        "test_redmul_clustered", redmul_clustered_source);
    return error;
}
// Runs the bitwise clustered reduce and/or/xor tests for (integer) type T.
template <typename T> int run_cluster_and_or_xor_for_type(RunTestForType rft)
{
    int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::and_>>(
        "test_redand_clustered", redand_clustered_source);
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::or_>>(
        "test_redor_clustered", redor_clustered_source);
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::xor_>>(
        "test_redxor_clustered", redxor_clustered_source);
    return error;
}
// Runs the logical clustered reduce and/or/xor tests for type T
// (truth-value semantics rather than bitwise).
template <typename T>
int run_cluster_logical_and_or_xor_for_type(RunTestForType rft)
{
    int error = rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_and>>(
        "test_redand_clustered_logical", redand_clustered_logical_source);
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_or>>(
        "test_redor_clustered_logical", redor_clustered_logical_source);
    error |= rft.run_impl<T, RED_CLU<T, ArithmeticOp::logical_xor>>(
        "test_redxor_clustered_logical", redxor_clustered_logical_source);
    return error;
}
}
// Entry point: runs all clustered-reduce tests. The harness skips the whole
// suite if the device lacks the listed extension.
int test_subgroup_functions_clustered_reduce(cl_device_id device,
                                             cl_context context,
                                             cl_command_queue queue,
                                             int num_elements)
{
    std::vector<std::string> required_extensions = {
        "cl_khr_subgroup_clustered_reduce"
    };
    // Work sizes shared by all sub-cases; adjust here if needed.
    constexpr size_t global_work_size = 2000;
    constexpr size_t local_work_size = 200;
    WorkGroupParams test_params(global_work_size, local_work_size,
                                required_extensions);
    RunTestForType rft(device, context, queue, num_elements, test_params);
    // add/max/min/mul: all scalar integer and floating-point types.
    int error = run_cluster_red_add_max_min_mul_for_type<cl_int>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_uint>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_long>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_ulong>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_short>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_ushort>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_char>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_uchar>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_float>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<cl_double>(rft);
    error |= run_cluster_red_add_max_min_mul_for_type<subgroups::cl_half>(rft);
    // Bitwise and/or/xor: integer types only.
    error |= run_cluster_and_or_xor_for_type<cl_int>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_uint>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_long>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_ulong>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_short>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_ushort>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_char>(rft);
    error |= run_cluster_and_or_xor_for_type<cl_uchar>(rft);
    // Logical variants are exercised for cl_int only.
    error |= run_cluster_logical_and_or_xor_for_type<cl_int>(rft);
    return error;
}

View File

@@ -0,0 +1,138 @@
//
// Copyright (c) 2021 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "procs.h"
#include "subhelpers.h"
#include "subgroup_common_kernels.h"
#include "subgroup_common_templates.h"
#include "harness/typeWrappers.h"
namespace {
// Runs the broadcast test for one extended (vector or half) type T and
// returns its error code directly.
template <typename T> int run_broadcast_for_extended_type(RunTestForType rft)
{
    return rft.run_impl<T, BC<T, SubgroupsBroadcastOp::broadcast>>(
        "test_bcast", bcast_source);
}
// Runs every (reduce / inclusive-scan / exclusive-scan) x (add / max / min)
// combination for type T, OR-combining the error codes.
template <typename T> int run_scan_reduction_for_type(RunTestForType rft)
{
    int error = rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>("test_redadd",
                                                               redadd_source);
    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>("test_redmax",
                                                            redmax_source);
    error |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>("test_redmin",
                                                            redmin_source);
    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>("test_scinadd",
                                                             scinadd_source);
    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>("test_scinmax",
                                                             scinmax_source);
    error |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>("test_scinmin",
                                                             scinmin_source);
    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>("test_scexadd",
                                                             scexadd_source);
    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>("test_scexmax",
                                                             scexmax_source);
    error |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>("test_scexmin",
                                                             scexmin_source);
    return error;
}
}
// Entry point: runs broadcast for every vector width (2/3/4/8/16) of every
// base type, plus scan/reduction for the newly supported 8- and 16-bit
// scalar types. Requires the extended-types subgroup extension.
int test_subgroup_functions_extended_types(cl_device_id device,
                                           cl_context context,
                                           cl_command_queue queue,
                                           int num_elements)
{
    std::vector<std::string> required_extensions = {
        "cl_khr_subgroup_extended_types"
    };
    constexpr size_t global_work_size = 2000;
    constexpr size_t local_work_size = 200;
    WorkGroupParams test_params(global_work_size, local_work_size,
                                required_extensions);
    RunTestForType rft(device, context, queue, num_elements, test_params);
    // 32-bit integer vectors.
    int error = run_broadcast_for_extended_type<cl_uint2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_uint3>(rft);
    error |= run_broadcast_for_extended_type<cl_uint4>(rft);
    error |= run_broadcast_for_extended_type<cl_uint8>(rft);
    error |= run_broadcast_for_extended_type<cl_uint16>(rft);
    error |= run_broadcast_for_extended_type<cl_int2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_int3>(rft);
    error |= run_broadcast_for_extended_type<cl_int4>(rft);
    error |= run_broadcast_for_extended_type<cl_int8>(rft);
    error |= run_broadcast_for_extended_type<cl_int16>(rft);
    // 64-bit integer vectors.
    error |= run_broadcast_for_extended_type<cl_ulong2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_ulong3>(rft);
    error |= run_broadcast_for_extended_type<cl_ulong4>(rft);
    error |= run_broadcast_for_extended_type<cl_ulong8>(rft);
    error |= run_broadcast_for_extended_type<cl_ulong16>(rft);
    error |= run_broadcast_for_extended_type<cl_long2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_long3>(rft);
    error |= run_broadcast_for_extended_type<cl_long4>(rft);
    error |= run_broadcast_for_extended_type<cl_long8>(rft);
    error |= run_broadcast_for_extended_type<cl_long16>(rft);
    // Floating-point vectors.
    error |= run_broadcast_for_extended_type<cl_float2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_float3>(rft);
    error |= run_broadcast_for_extended_type<cl_float4>(rft);
    error |= run_broadcast_for_extended_type<cl_float8>(rft);
    error |= run_broadcast_for_extended_type<cl_float16>(rft);
    error |= run_broadcast_for_extended_type<cl_double2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_double3>(rft);
    error |= run_broadcast_for_extended_type<cl_double4>(rft);
    error |= run_broadcast_for_extended_type<cl_double8>(rft);
    error |= run_broadcast_for_extended_type<cl_double16>(rft);
    // 16-bit integer vectors.
    error |= run_broadcast_for_extended_type<cl_ushort2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_ushort3>(rft);
    error |= run_broadcast_for_extended_type<cl_ushort4>(rft);
    error |= run_broadcast_for_extended_type<cl_ushort8>(rft);
    error |= run_broadcast_for_extended_type<cl_ushort16>(rft);
    error |= run_broadcast_for_extended_type<cl_short2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_short3>(rft);
    error |= run_broadcast_for_extended_type<cl_short4>(rft);
    error |= run_broadcast_for_extended_type<cl_short8>(rft);
    error |= run_broadcast_for_extended_type<cl_short16>(rft);
    // 8-bit integer vectors.
    error |= run_broadcast_for_extended_type<cl_uchar2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_uchar3>(rft);
    error |= run_broadcast_for_extended_type<cl_uchar4>(rft);
    error |= run_broadcast_for_extended_type<cl_uchar8>(rft);
    error |= run_broadcast_for_extended_type<cl_uchar16>(rft);
    error |= run_broadcast_for_extended_type<cl_char2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_char3>(rft);
    error |= run_broadcast_for_extended_type<cl_char4>(rft);
    error |= run_broadcast_for_extended_type<cl_char8>(rft);
    error |= run_broadcast_for_extended_type<cl_char16>(rft);
    // Half-precision vectors (project-provided wrappers).
    error |= run_broadcast_for_extended_type<subgroups::cl_half2>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_half3>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_half4>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_half8>(rft);
    error |= run_broadcast_for_extended_type<subgroups::cl_half16>(rft);
    // Scans and reductions for the new 8/16-bit scalar types.
    error |= run_scan_reduction_for_type<cl_uchar>(rft);
    error |= run_scan_reduction_for_type<cl_char>(rft);
    error |= run_scan_reduction_for_type<cl_ushort>(rft);
    error |= run_scan_reduction_for_type<cl_short>(rft);
    return error;
}

View File

@@ -0,0 +1,473 @@
//
// Copyright (c) 2021 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "procs.h"
#include "subhelpers.h"
#include "harness/typeWrappers.h"
#include "subgroup_common_templates.h"
namespace {
// Non-uniform inclusive-scan kernels. Each work item tests its lane bit
// against WORK_ITEMS_MASK (a compile-time constant injected by the test
// harness at build time); only the selected work items execute the
// non-uniform built-in, so out[] is written only for active lanes.
static const char *scinadd_non_uniform_source = R"(
__kernel void test_scinadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_add(in[gid]);
}
}
)";
static const char *scinmax_non_uniform_source = R"(
__kernel void test_scinmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_max(in[gid]);
}
}
)";
static const char *scinmin_non_uniform_source = R"(
__kernel void test_scinmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_min(in[gid]);
}
}
)";
static const char *scinmul_non_uniform_source = R"(
__kernel void test_scinmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_mul(in[gid]);
}
}
)";
static const char *scinand_non_uniform_source = R"(
__kernel void test_scinand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_and(in[gid]);
}
}
)";
static const char *scinor_non_uniform_source = R"(
__kernel void test_scinor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_or(in[gid]);
}
}
)";
static const char *scinxor_non_uniform_source = R"(
__kernel void test_scinxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_xor(in[gid]);
}
}
)";
static const char *scinand_non_uniform_logical_source = R"(
__kernel void test_scinand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_logical_and(in[gid]);
}
}
)";
static const char *scinor_non_uniform_logical_source = R"(
__kernel void test_scinor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_logical_or(in[gid]);
}
}
)";
static const char *scinxor_non_uniform_logical_source = R"(
__kernel void test_scinxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
int elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_inclusive_logical_xor(in[gid]);
}
}
)";
// Kernels for the sub_group_non_uniform_scan_exclusive_* built-ins.
// WORK_ITEMS_MASK is a 32-bit pattern supplied at kernel build time; a work
// item takes part in the non-uniform call only when its lane bit is set, so
// inactive lanes leave their out[] slot untouched.
// The lane bit is computed with unsigned arithmetic (uint / 1u) so that lane
// 31 (mask 0x80000000 is used by the host) does not left-shift into the sign
// bit of a signed int, mirroring the vote-test kernels which use uint.
// exclusive scan: add
static const char *scexadd_non_uniform_source = R"(
__kernel void test_scexadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_add(in[gid]);
}
}
)";
// exclusive scan: max
static const char *scexmax_non_uniform_source = R"(
__kernel void test_scexmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_max(in[gid]);
}
}
)";
// exclusive scan: min
static const char *scexmin_non_uniform_source = R"(
__kernel void test_scexmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_min(in[gid]);
}
}
)";
// exclusive scan: mul
static const char *scexmul_non_uniform_source = R"(
__kernel void test_scexmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_mul(in[gid]);
}
}
)";
// exclusive scan: bitwise and
static const char *scexand_non_uniform_source = R"(
__kernel void test_scexand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_and(in[gid]);
}
}
)";
// exclusive scan: bitwise or
static const char *scexor_non_uniform_source = R"(
__kernel void test_scexor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_or(in[gid]);
}
}
)";
// exclusive scan: bitwise xor
static const char *scexxor_non_uniform_source = R"(
__kernel void test_scexxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_xor(in[gid]);
}
}
)";
// exclusive scan: logical and
static const char *scexand_non_uniform_logical_source = R"(
__kernel void test_scexand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_logical_and(in[gid]);
}
}
)";
// exclusive scan: logical or
static const char *scexor_non_uniform_logical_source = R"(
__kernel void test_scexor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_logical_or(in[gid]);
}
}
)";
// exclusive scan: logical xor
static const char *scexxor_non_uniform_logical_source = R"(
__kernel void test_scexxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_scan_exclusive_logical_xor(in[gid]);
}
}
)";
// Kernels for the sub_group_non_uniform_reduce_* built-ins.
// WORK_ITEMS_MASK is a 32-bit pattern supplied at kernel build time; a work
// item takes part in the non-uniform call only when its lane bit is set, so
// inactive lanes leave their out[] slot untouched.
// The lane bit is computed with unsigned arithmetic (uint / 1u) so that lane
// 31 (mask 0x80000000 is used by the host) does not left-shift into the sign
// bit of a signed int, mirroring the vote-test kernels which use uint.
// reduce: add
static const char *redadd_non_uniform_source = R"(
__kernel void test_redadd_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_add(in[gid]);
}
}
)";
// reduce: max
static const char *redmax_non_uniform_source = R"(
__kernel void test_redmax_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_max(in[gid]);
}
}
)";
// reduce: min
static const char *redmin_non_uniform_source = R"(
__kernel void test_redmin_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_min(in[gid]);
}
}
)";
// reduce: mul
static const char *redmul_non_uniform_source = R"(
__kernel void test_redmul_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_mul(in[gid]);
}
}
)";
// reduce: bitwise and
static const char *redand_non_uniform_source = R"(
__kernel void test_redand_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_and(in[gid]);
}
}
)";
// reduce: bitwise or
static const char *redor_non_uniform_source = R"(
__kernel void test_redor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_or(in[gid]);
}
}
)";
// reduce: bitwise xor
static const char *redxor_non_uniform_source = R"(
__kernel void test_redxor_non_uniform(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_xor(in[gid]);
}
}
)";
// reduce: logical and
static const char *redand_non_uniform_logical_source = R"(
__kernel void test_redand_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_logical_and(in[gid]);
}
}
)";
// reduce: logical or
static const char *redor_non_uniform_logical_source = R"(
__kernel void test_redor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_logical_or(in[gid]);
}
}
)";
// reduce: logical xor
static const char *redxor_non_uniform_logical_source = R"(
__kernel void test_redxor_non_uniform_logical(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1u << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_reduce_logical_xor(in[gid]);
}
}
)";
// Exercises the non-uniform inclusive-scan, exclusive-scan and reduce
// built-ins for the add/mul/max/min operations with element type T.
// Returns 0 when every sub-test passes, non-zero otherwise.
template <typename T>
int run_functions_add_mul_max_min_for_type(RunTestForType rft)
{
    int err = 0;
    // Inclusive scans.
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::add_>>(
        "test_scinadd_non_uniform", scinadd_non_uniform_source);
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::mul_>>(
        "test_scinmul_non_uniform", scinmul_non_uniform_source);
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::max_>>(
        "test_scinmax_non_uniform", scinmax_non_uniform_source);
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::min_>>(
        "test_scinmin_non_uniform", scinmin_non_uniform_source);
    // Exclusive scans.
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::add_>>(
        "test_scexadd_non_uniform", scexadd_non_uniform_source);
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::mul_>>(
        "test_scexmul_non_uniform", scexmul_non_uniform_source);
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::max_>>(
        "test_scexmax_non_uniform", scexmax_non_uniform_source);
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::min_>>(
        "test_scexmin_non_uniform", scexmin_non_uniform_source);
    // Reductions.
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::add_>>(
        "test_redadd_non_uniform", redadd_non_uniform_source);
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::mul_>>(
        "test_redmul_non_uniform", redmul_non_uniform_source);
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::max_>>(
        "test_redmax_non_uniform", redmax_non_uniform_source);
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::min_>>(
        "test_redmin_non_uniform", redmin_non_uniform_source);
    return err;
}
// Exercises the non-uniform scan and reduce built-ins for the bitwise
// and/or/xor operations with element type T. Returns 0 on full success.
template <typename T> int run_functions_and_or_xor_for_type(RunTestForType rft)
{
    int err = 0;
    // Inclusive scans.
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::and_>>(
        "test_scinand_non_uniform", scinand_non_uniform_source);
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::or_>>(
        "test_scinor_non_uniform", scinor_non_uniform_source);
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::xor_>>(
        "test_scinxor_non_uniform", scinxor_non_uniform_source);
    // Exclusive scans.
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::and_>>(
        "test_scexand_non_uniform", scexand_non_uniform_source);
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::or_>>(
        "test_scexor_non_uniform", scexor_non_uniform_source);
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::xor_>>(
        "test_scexxor_non_uniform", scexxor_non_uniform_source);
    // Reductions.
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::and_>>(
        "test_redand_non_uniform", redand_non_uniform_source);
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::or_>>(
        "test_redor_non_uniform", redor_non_uniform_source);
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::xor_>>(
        "test_redxor_non_uniform", redxor_non_uniform_source);
    return err;
}
// Exercises the non-uniform scan and reduce built-ins for the logical
// and/or/xor operations with element type T. Returns 0 on full success.
template <typename T>
int run_functions_logical_and_or_xor_for_type(RunTestForType rft)
{
    int err = 0;
    // Inclusive scans.
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_and>>(
        "test_scinand_non_uniform_logical", scinand_non_uniform_logical_source);
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_or>>(
        "test_scinor_non_uniform_logical", scinor_non_uniform_logical_source);
    err |= rft.run_impl<T, SCIN_NU<T, ArithmeticOp::logical_xor>>(
        "test_scinxor_non_uniform_logical", scinxor_non_uniform_logical_source);
    // Exclusive scans.
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_and>>(
        "test_scexand_non_uniform_logical", scexand_non_uniform_logical_source);
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_or>>(
        "test_scexor_non_uniform_logical", scexor_non_uniform_logical_source);
    err |= rft.run_impl<T, SCEX_NU<T, ArithmeticOp::logical_xor>>(
        "test_scexxor_non_uniform_logical", scexxor_non_uniform_logical_source);
    // Reductions.
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_and>>(
        "test_redand_non_uniform_logical", redand_non_uniform_logical_source);
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_or>>(
        "test_redor_non_uniform_logical", redor_non_uniform_logical_source);
    err |= rft.run_impl<T, RED_NU<T, ArithmeticOp::logical_xor>>(
        "test_redxor_non_uniform_logical", redxor_non_uniform_logical_source);
    return err;
}
}
// Entry point for the cl_khr_subgroup_non_uniform_arithmetic test suite.
// Builds the work-group parameters (including the set of work-item masks that
// select which lanes enter the non-uniform call) and runs every operation
// variant for all supported element types. Returns 0 on success.
int test_subgroup_functions_non_uniform_arithmetic(cl_device_id device,
                                                   cl_context context,
                                                   cl_command_queue queue,
                                                   int num_elements)
{
    std::vector<std::string> required_extensions = {
        "cl_khr_subgroup_non_uniform_arithmetic"
    };
    // 32-bit patterns; bit N set means work item N of the subgroup is active.
    std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
                                 0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
                                 0x00ffff00, 0x80000000, 0xaaaaaaaa };
    constexpr size_t global_work_size = 2000;
    constexpr size_t local_work_size = 200;
    WorkGroupParams test_params(global_work_size, local_work_size,
                                required_extensions, masks);
    RunTestForType rft(device, context, queue, num_elements, test_params);

    int err = 0;
    // add/mul/max/min are defined for every integer and floating type.
    err |= run_functions_add_mul_max_min_for_type<cl_int>(rft);
    err |= run_functions_add_mul_max_min_for_type<cl_uint>(rft);
    err |= run_functions_add_mul_max_min_for_type<cl_long>(rft);
    err |= run_functions_add_mul_max_min_for_type<cl_ulong>(rft);
    err |= run_functions_add_mul_max_min_for_type<cl_short>(rft);
    err |= run_functions_add_mul_max_min_for_type<cl_ushort>(rft);
    err |= run_functions_add_mul_max_min_for_type<cl_char>(rft);
    err |= run_functions_add_mul_max_min_for_type<cl_uchar>(rft);
    err |= run_functions_add_mul_max_min_for_type<cl_float>(rft);
    err |= run_functions_add_mul_max_min_for_type<cl_double>(rft);
    err |= run_functions_add_mul_max_min_for_type<subgroups::cl_half>(rft);
    // Bitwise and/or/xor only apply to the integer types.
    err |= run_functions_and_or_xor_for_type<cl_int>(rft);
    err |= run_functions_and_or_xor_for_type<cl_uint>(rft);
    err |= run_functions_and_or_xor_for_type<cl_long>(rft);
    err |= run_functions_and_or_xor_for_type<cl_ulong>(rft);
    err |= run_functions_and_or_xor_for_type<cl_short>(rft);
    err |= run_functions_and_or_xor_for_type<cl_ushort>(rft);
    err |= run_functions_and_or_xor_for_type<cl_char>(rft);
    err |= run_functions_and_or_xor_for_type<cl_uchar>(rft);
    // Logical and/or/xor are run for cl_int only.
    err |= run_functions_logical_and_or_xor_for_type<cl_int>(rft);
    return err;
}

View File

@@ -0,0 +1,303 @@
//
// Copyright (c) 2021 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "procs.h"
#include "subhelpers.h"
#include "harness/typeWrappers.h"
#include <set>
namespace {
// Host-side data generator (gen) and result checker (chk) for the
// sub_group_elect() and sub_group_non_uniform_{all,any,all_equal}() tests.
// T is the element type; operation selects which built-in is verified.
template <typename T, NonUniformVoteOp operation> struct VOTE
{
    // Fill the host input buffer for one run.
    //   x - flattened per-work-group input data handed to the device
    //   t - scratch buffer holding one work-group worth of values
    //   m - xy map buffer from the device (advanced here but never read)
    // For elect there is no input to prepare, so gen only logs and returns.
    static void gen(T *x, T *t, cl_int *m, const WorkGroupParams &test_params)
    {
        int i, ii, j, k, n;
        int nw = test_params.local_workgroup_size;
        int ns = test_params.subgroup_size;
        int ng = test_params.global_workgroup_size;
        uint32_t work_items_mask = test_params.work_items_mask;
        int nj = (nw + ns - 1) / ns; // subgroups per work-group
        int non_uniform_size = ng % nw; // non-zero => last group is smaller
        ng = ng / nw;
        int last_subgroup_size = 0;
        ii = 0;
        log_info(" sub_group_%s%s... \n",
                 (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_",
                 operation_names(operation));
        log_info(" test params: global size = %d local size = %d subgroups "
                 "size = %d work item mask = 0x%x data type (%s)\n",
                 test_params.global_workgroup_size, nw, ns, work_items_mask,
                 TypeManager<T>::name());
        if (non_uniform_size)
        {
            log_info(" non uniform work group size mode ON\n");
        }
        // sub_group_elect() takes no input argument: nothing to generate.
        if (operation == NonUniformVoteOp::elect) return;
        for (k = 0; k < ng; ++k)
        { // for each work_group
            if (non_uniform_size && k == ng - 1)
            {
                // Re-derive nj/last_subgroup_size for the smaller trailing
                // work-group.
                set_last_workgroup_params(non_uniform_size, nj, ns, nw,
                                          last_subgroup_size);
            }
            for (j = 0; j < nj; ++j)
            { // for each subgroup
                ii = j * ns;
                if (last_subgroup_size && j == nj - 1)
                {
                    n = last_subgroup_size;
                }
                else
                {
                    n = ii + ns > nw ? nw - ii : ns;
                }
                // e selects this subgroup's fill pattern:
                //   0 -> all zeros, 1 -> all ones, 2 -> alternating 0/1.
                int e = genrand_int32(gMTdata) % 3;
                for (i = 0; i < n; i++)
                {
                    if (e == 2)
                    { // set once 0 and once 1 alternately
                        int value = i % 2;
                        set_value(t[ii + i], value);
                    }
                    else
                    { // set 0/1 for all work items in subgroup
                        set_value(t[ii + i], e);
                    }
                }
            }
            // Now map into work group using map from device
            for (j = 0; j < nw; ++j)
            {
                x[j] = t[j];
            }
            x += nw;
            m += 4 * nw;
        }
    }

    // Verify device output y against the expected value computed on the host
    // from input x. Returns TEST_PASS or TEST_FAIL.
    //   x  - host-generated input data    y  - device output data
    //   mx - scratch copy of one work-group of x
    //   my - scratch copy of one work-group of y
    //   m  - xy map buffer (advanced here but never read)
    static int chk(T *x, T *y, T *mx, T *my, cl_int *m,
                   const WorkGroupParams &test_params)
    {
        int ii, i, j, k, n;
        int nw = test_params.local_workgroup_size;
        int ns = test_params.subgroup_size;
        int ng = test_params.global_workgroup_size;
        uint32_t work_items_mask = test_params.work_items_mask;
        int nj = (nw + ns - 1) / ns;
        cl_int tr, rr; // tr = expected value, rr = normalized device value
        int non_uniform_size = ng % nw;
        ng = ng / nw;
        if (non_uniform_size) ng++; // account for the smaller trailing group
        int last_subgroup_size = 0;
        for (k = 0; k < ng; ++k)
        { // for each work_group
            if (non_uniform_size && k == ng - 1)
            {
                set_last_workgroup_params(non_uniform_size, nj, ns, nw,
                                          last_subgroup_size);
            }
            for (j = 0; j < nw; ++j)
            { // inside the work_group
                mx[j] = x[j]; // read host inputs for work_group
                my[j] = y[j]; // read device outputs for work_group
            }
            for (j = 0; j < nj; ++j)
            { // for each subgroup
                ii = j * ns;
                if (last_subgroup_size && j == nj - 1)
                {
                    n = last_subgroup_size;
                }
                else
                {
                    n = ii + ns > nw ? nw - ii : ns;
                }
                rr = 0;
                // Seed the expected value: all/all_equal use AND semantics
                // (start true); any uses OR semantics (start false).
                if (operation == NonUniformVoteOp::all
                    || operation == NonUniformVoteOp::all_equal)
                    tr = 1;
                if (operation == NonUniformVoteOp::any) tr = 0;
                std::set<int> active_work_items;
                for (i = 0; i < n; ++i)
                {
                    // NOTE(review): for i % 32 == 31 this shifts a signed int
                    // literal into the sign bit; `1u <<` would be safer —
                    // confirm intended.
                    uint32_t check_work_item = 1 << (i % 32);
                    if (work_items_mask & check_work_item)
                    {
                        active_work_items.insert(i);
                        switch (operation)
                        {
                            case NonUniformVoteOp::elect: break;
                            case NonUniformVoteOp::all:
                                tr &=
                                    !compare_ordered<T>(mx[ii + i], 0) ? 1 : 0;
                                break;
                            case NonUniformVoteOp::any:
                                tr |=
                                    !compare_ordered<T>(mx[ii + i], 0) ? 1 : 0;
                                break;
                            case NonUniformVoteOp::all_equal:
                                // Compare against the lowest active item's
                                // input value.
                                tr &= compare_ordered<T>(
                                          mx[ii + i],
                                          mx[ii + *active_work_items.begin()])
                                    ? 1
                                    : 0;
                                break;
                            default:
                                log_error("Unknown operation\n");
                                return TEST_FAIL;
                        }
                    }
                }
                if (active_work_items.empty())
                {
                    log_info(" no one workitem acitve... in workgroup id = %d "
                             "subgroup id = %d\n",
                             k, j);
                }
                else
                {
                    auto lowest_active = active_work_items.begin();
                    for (const int &active_work_item : active_work_items)
                    {
                        i = active_work_item;
                        // elect: only the lowest active work item is expected
                        // to observe 1; every other active item observes 0.
                        if (operation == NonUniformVoteOp::elect)
                        {
                            i == *lowest_active ? tr = 1 : tr = 0;
                        }
                        // normalize device values on host, non zero set 1.
                        rr = compare_ordered<T>(my[ii + i], 0) ? 0 : 1;
                        if (rr != tr)
                        {
                            log_error("ERROR: sub_group_%s() \n",
                                      operation_names(operation));
                            log_error(
                                "mismatch for work item %d sub group %d in "
                                "work group %d. Expected: %d Obtained: %d\n",
                                i, j, k, tr, rr);
                            return TEST_FAIL;
                        }
                    }
                }
            }
            x += nw;
            y += nw;
            m += 4 * nw;
        }
        log_info(" sub_group_%s%s... passed\n",
                 (operation == NonUniformVoteOp::elect) ? "" : "non_uniform_",
                 operation_names(operation));
        return TEST_PASS;
    }
};
// Vote-test kernels. WORK_ITEMS_MASK is a 32-bit pattern supplied at build
// time; only work items whose lane bit is set enter the call, so inactive
// lanes leave their out[] slot untouched.
// Only the lowest active lane of each subgroup gets a non-zero result.
static const char *elect_source = R"(
__kernel void test_elect(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_elect();
}
}
)";
// True when any active lane's input is non-zero.
static const char *non_uniform_any_source = R"(
__kernel void test_non_uniform_any(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_any(in[gid]);
}
}
)";
// True when every active lane's input is non-zero.
static const char *non_uniform_all_source = R"(
__kernel void test_non_uniform_all(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_all(in[gid]);
}
}
)";
// True when every active lane holds the same input value.
static const char *non_uniform_all_equal_source = R"(
__kernel void test_non_uniform_all_equal(const __global Type *in, __global int4 *xy, __global Type *out) {
int gid = get_global_id(0);
XY(xy,gid);
uint elect_work_item = 1 << (get_sub_group_local_id() % 32);
if (elect_work_item & WORK_ITEMS_MASK){
out[gid] = sub_group_non_uniform_all_equal(in[gid]);
}
}
)";
// Runs the sub_group_non_uniform_all_equal test for element type T.
// Returns 0 on success, non-zero on failure.
template <typename T> int run_vote_all_equal_for_type(RunTestForType rft)
{
    return rft.run_impl<T, VOTE<T, NonUniformVoteOp::all_equal>>(
        "test_non_uniform_all_equal", non_uniform_all_equal_source);
}
}
// Entry point for the cl_khr_subgroup_non_uniform_vote test suite.
// all_equal is exercised for every supported element type; all, elect and
// any use cl_int as a representative type. Returns 0 on success.
int test_subgroup_functions_non_uniform_vote(cl_device_id device,
                                             cl_context context,
                                             cl_command_queue queue,
                                             int num_elements)
{
    std::vector<std::string> required_extensions = {
        "cl_khr_subgroup_non_uniform_vote"
    };
    // 32-bit patterns; bit N set means work item N of the subgroup is active.
    std::vector<uint32_t> masks{ 0xffffffff, 0x55aaaa55, 0x5555aaaa, 0xaaaa5555,
                                 0x0f0ff0f0, 0x0f0f0f0f, 0xff0000ff, 0xff00ff00,
                                 0x00ffff00, 0x80000000 };
    constexpr size_t global_work_size = 170;
    constexpr size_t local_work_size = 64;
    WorkGroupParams test_params(global_work_size, local_work_size,
                                required_extensions, masks);
    RunTestForType rft(device, context, queue, num_elements, test_params);

    int err = 0;
    err |= run_vote_all_equal_for_type<cl_int>(rft);
    err |= run_vote_all_equal_for_type<cl_uint>(rft);
    err |= run_vote_all_equal_for_type<cl_long>(rft);
    err |= run_vote_all_equal_for_type<cl_ulong>(rft);
    err |= run_vote_all_equal_for_type<cl_float>(rft);
    err |= run_vote_all_equal_for_type<cl_double>(rft);
    err |= run_vote_all_equal_for_type<subgroups::cl_half>(rft);
    err |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::all>>(
        "test_non_uniform_all", non_uniform_all_source);
    err |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::elect>>(
        "test_elect", elect_source);
    err |= rft.run_impl<cl_int, VOTE<cl_int, NonUniformVoteOp::any>>(
        "test_non_uniform_any", non_uniform_any_source);
    return err;
}

View File

@@ -0,0 +1,78 @@
//
// Copyright (c) 2021 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "procs.h"
#include "subhelpers.h"
#include "subgroup_common_templates.h"
#include "harness/typeWrappers.h"
#include <bitset>
namespace {
// Kernel: each work item fetches the value held by the lane whose index is
// its own sub-group local id XOR'd with xy[gid].z.
static const char* shuffle_xor_source =
    "__kernel void test_sub_group_shuffle_xor(const __global Type *in, "
    "__global int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " Type x = in[gid];\n"
    " out[gid] = sub_group_shuffle_xor(x, xy[gid].z);"
    "}\n";
// Kernel: each work item fetches the value held by the lane whose index is
// xy[gid].z.
static const char* shuffle_source =
    "__kernel void test_sub_group_shuffle(const __global Type *in, __global "
    "int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " Type x = in[gid];\n"
    " out[gid] = sub_group_shuffle(x, xy[gid].z);"
    "}\n";
// Runs sub_group_shuffle and sub_group_shuffle_xor for element type T.
// Returns 0 on success, non-zero on failure.
template <typename T> int run_shuffle_for_type(RunTestForType rft)
{
    int err = 0;
    err |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle>>("test_sub_group_shuffle",
                                                       shuffle_source);
    err |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_xor>>(
        "test_sub_group_shuffle_xor", shuffle_xor_source);
    return err;
}
}
// Entry point for the cl_khr_subgroup_shuffle test suite. Runs the shuffle
// and shuffle_xor built-ins for every supported element type.
// Returns 0 on success.
int test_subgroup_functions_shuffle(cl_device_id device, cl_context context,
                                    cl_command_queue queue, int num_elements)
{
    std::vector<std::string> required_extensions{ "cl_khr_subgroup_shuffle" };
    constexpr size_t global_work_size = 2000;
    constexpr size_t local_work_size = 200;
    WorkGroupParams test_params(global_work_size, local_work_size,
                                required_extensions);
    RunTestForType rft(device, context, queue, num_elements, test_params);

    int err = 0;
    err |= run_shuffle_for_type<cl_int>(rft);
    err |= run_shuffle_for_type<cl_uint>(rft);
    err |= run_shuffle_for_type<cl_long>(rft);
    err |= run_shuffle_for_type<cl_ulong>(rft);
    err |= run_shuffle_for_type<cl_short>(rft);
    err |= run_shuffle_for_type<cl_ushort>(rft);
    err |= run_shuffle_for_type<cl_char>(rft);
    err |= run_shuffle_for_type<cl_uchar>(rft);
    err |= run_shuffle_for_type<cl_float>(rft);
    err |= run_shuffle_for_type<cl_double>(rft);
    err |= run_shuffle_for_type<subgroups::cl_half>(rft);
    return err;
}

View File

@@ -0,0 +1,81 @@
//
// Copyright (c) 2021 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "procs.h"
#include "subhelpers.h"
#include "subgroup_common_templates.h"
#include "harness/conversions.h"
#include "harness/typeWrappers.h"
namespace {
// Kernel: each work item fetches the value held by the lane xy[gid].z
// positions below its own sub-group local id.
static const char* shuffle_down_source =
    "__kernel void test_sub_group_shuffle_down(const __global Type *in, "
    "__global int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " Type x = in[gid];\n"
    " out[gid] = sub_group_shuffle_down(x, xy[gid].z);"
    "}\n";
// Kernel: each work item fetches the value held by the lane xy[gid].z
// positions above its own sub-group local id.
static const char* shuffle_up_source =
    "__kernel void test_sub_group_shuffle_up(const __global Type *in, __global "
    "int4 *xy, __global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " Type x = in[gid];\n"
    " out[gid] = sub_group_shuffle_up(x, xy[gid].z);"
    "}\n";
// Runs sub_group_shuffle_up and sub_group_shuffle_down for element type T.
// Returns 0 on success, non-zero on failure.
template <typename T> int run_shuffle_relative_for_type(RunTestForType rft)
{
    int err = 0;
    err |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_up>>(
        "test_sub_group_shuffle_up", shuffle_up_source);
    err |= rft.run_impl<T, SHF<T, ShuffleOp::shuffle_down>>(
        "test_sub_group_shuffle_down", shuffle_down_source);
    return err;
}
}
// Entry point for the cl_khr_subgroup_shuffle_relative test suite. Runs the
// shuffle_up and shuffle_down built-ins for every supported element type.
// Returns 0 on success.
int test_subgroup_functions_shuffle_relative(cl_device_id device,
                                             cl_context context,
                                             cl_command_queue queue,
                                             int num_elements)
{
    std::vector<std::string> required_extensions = {
        "cl_khr_subgroup_shuffle_relative"
    };
    constexpr size_t global_work_size = 2000;
    constexpr size_t local_work_size = 200;
    WorkGroupParams test_params(global_work_size, local_work_size,
                                required_extensions);
    RunTestForType rft(device, context, queue, num_elements, test_params);

    int err = 0;
    err |= run_shuffle_relative_for_type<cl_int>(rft);
    err |= run_shuffle_relative_for_type<cl_uint>(rft);
    err |= run_shuffle_relative_for_type<cl_long>(rft);
    err |= run_shuffle_relative_for_type<cl_ulong>(rft);
    err |= run_shuffle_relative_for_type<cl_short>(rft);
    err |= run_shuffle_relative_for_type<cl_ushort>(rft);
    err |= run_shuffle_relative_for_type<cl_char>(rft);
    err |= run_shuffle_relative_for_type<cl_uchar>(rft);
    err |= run_shuffle_relative_for_type<cl_float>(rft);
    err |= run_shuffle_relative_for_type<cl_double>(rft);
    err |= run_shuffle_relative_for_type<subgroups::cl_half>(rft);
    return err;
}

View File

@@ -1,727 +0,0 @@
//
// Copyright (c) 2017 The Khronos Group Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "procs.h"
#include "subhelpers.h"
#include "harness/conversions.h"
#include "harness/typeWrappers.h"
// Legacy core-subgroup kernels (uniform work-group variants; this file is
// being removed in favour of the extension-specific tests).
// sub_group_any: non-zero for the whole subgroup if any input is non-zero.
static const char *any_source = "__kernel void test_any(const __global Type "
                                "*in, __global int2 *xy, __global Type *out)\n"
                                "{\n"
                                " int gid = get_global_id(0);\n"
                                " XY(xy,gid);\n"
                                " out[gid] = sub_group_any(in[gid]);\n"
                                "}\n";
// sub_group_all: non-zero only if every input in the subgroup is non-zero.
static const char *all_source = "__kernel void test_all(const __global Type "
                                "*in, __global int2 *xy, __global Type *out)\n"
                                "{\n"
                                " int gid = get_global_id(0);\n"
                                " XY(xy,gid);\n"
                                " out[gid] = sub_group_all(in[gid]);\n"
                                "}\n";
// sub_group_broadcast: broadcast from the lane encoded in the input value
// (x % 100 picks the source local id).
static const char *bcast_source =
    "__kernel void test_bcast(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " Type x = in[gid];\n"
    " size_t loid = (size_t)((int)x % 100);\n"
    " out[gid] = sub_group_broadcast(x, loid);\n"
    "}\n";
// sub_group_reduce_add
static const char *redadd_source =
    "__kernel void test_redadd(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " out[gid] = sub_group_reduce_add(in[gid]);\n"
    "}\n";
// sub_group_reduce_max
static const char *redmax_source =
    "__kernel void test_redmax(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " out[gid] = sub_group_reduce_max(in[gid]);\n"
    "}\n";
// sub_group_reduce_min
static const char *redmin_source =
    "__kernel void test_redmin(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " out[gid] = sub_group_reduce_min(in[gid]);\n"
    "}\n";
// sub_group_scan_inclusive_add
static const char *scinadd_source =
    "__kernel void test_scinadd(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " out[gid] = sub_group_scan_inclusive_add(in[gid]);\n"
    "}\n";
// sub_group_scan_inclusive_max
static const char *scinmax_source =
    "__kernel void test_scinmax(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " out[gid] = sub_group_scan_inclusive_max(in[gid]);\n"
    "}\n";
// sub_group_scan_inclusive_min
static const char *scinmin_source =
    "__kernel void test_scinmin(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " out[gid] = sub_group_scan_inclusive_min(in[gid]);\n"
    "}\n";
// sub_group_scan_exclusive_add
static const char *scexadd_source =
    "__kernel void test_scexadd(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " out[gid] = sub_group_scan_exclusive_add(in[gid]);\n"
    "}\n";
// sub_group_scan_exclusive_max
static const char *scexmax_source =
    "__kernel void test_scexmax(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " out[gid] = sub_group_scan_exclusive_max(in[gid]);\n"
    "}\n";
// sub_group_scan_exclusive_min
static const char *scexmin_source =
    "__kernel void test_scexmin(const __global Type *in, __global int2 *xy, "
    "__global Type *out)\n"
    "{\n"
    " int gid = get_global_id(0);\n"
    " XY(xy,gid);\n"
    " out[gid] = sub_group_scan_exclusive_min(in[gid]);\n"
    "}\n";
// Any/All test functions
template <int Which> struct AA
{
static void gen(cl_int *x, cl_int *t, cl_int *m, int ns, int nw, int ng)
{
int i, ii, j, k, n;
int nj = (nw + ns - 1) / ns;
int e;
ii = 0;
for (k = 0; k < ng; ++k)
{
for (j = 0; j < nj; ++j)
{
ii = j * ns;
n = ii + ns > nw ? nw - ii : ns;
e = (int)(genrand_int32(gMTdata) % 3);
// Initialize data matrix indexed by local id and sub group id
switch (e)
{
case 0: memset(&t[ii], 0, n * sizeof(cl_int)); break;
case 1:
memset(&t[ii], 0, n * sizeof(cl_int));
i = (int)(genrand_int32(gMTdata) % (cl_uint)n);
t[ii + i] = 41;
break;
case 2: memset(&t[ii], 0xff, n * sizeof(cl_int)); break;
}
}
// Now map into work group using map from device
for (j = 0; j < nw; ++j)
{
i = m[2 * j + 1] * ns + m[2 * j];
x[j] = t[i];
}
x += nw;
m += 2 * nw;
}
}
static int chk(cl_int *x, cl_int *y, cl_int *mx, cl_int *my, cl_int *m,
int ns, int nw, int ng)
{
int ii, i, j, k, n;
int nj = (nw + ns - 1) / ns;
cl_int taa, raa;
log_info(" sub_group_%s...\n", Which == 0 ? "any" : "all");
for (k = 0; k < ng; ++k)
{
// Map to array indexed to array indexed by local ID and sub group
for (j = 0; j < nw; ++j)
{
i = m[2 * j + 1] * ns + m[2 * j];
mx[i] = x[j];
my[i] = y[j];
}
for (j = 0; j < nj; ++j)
{
ii = j * ns;
n = ii + ns > nw ? nw - ii : ns;
// Compute target
if (Which == 0)
{
taa = 0;
for (i = 0; i < n; ++i) taa |= mx[ii + i] != 0;
}
else
{
taa = 1;
for (i = 0; i < n; ++i) taa &= mx[ii + i] != 0;
}
// Check result
for (i = 0; i < n; ++i)
{
raa = my[ii + i] != 0;
if (raa != taa)
{
log_error("ERROR: sub_group_%s mismatch for local id "
"%d in sub group %d in group %d\n",
Which == 0 ? "any" : "all", i, j, k);
return -1;
}
}
}
x += nw;
y += nw;
m += 2 * nw;
}
return 0;
}
};
// Reduce functions.
// Which selects the operation: 0 = add, 1 = max, 2 = min.
template <typename Ty, int Which> struct RED
{
    // Generate random input values in [1, ns] for each sub group, then
    // scatter into device (work group) order using the id map m.
    // Small positive values keep the add reduction overflow free.
    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
    {
        int nj = (nw + ns - 1) / ns;
        for (int k = 0; k < ng; ++k)
        {
            for (int j = 0; j < nj; ++j)
            {
                int ii = j * ns;
                // The last sub group of a work group may be partial.
                int n = ii + ns > nw ? nw - ii : ns;
                for (int i = 0; i < n; ++i)
                    t[ii + i] = (Ty)(
                        (int)(genrand_int32(gMTdata) & 0x7fffffff) % ns + 1);
            }
            // Scatter into work group order using the map from the device.
            for (int j = 0; j < nw; ++j)
            {
                x[j] = t[m[2 * j + 1] * ns + m[2 * j]];
            }
            x += nw;
            m += 2 * nw;
        }
    }
    // Compare device output y against a host-side reduction over each sub
    // group. Returns 0 on success, -1 on the first mismatch.
    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw,
                   int ng)
    {
        int nj = (nw + ns - 1) / ns;
        log_info(" sub_group_reduce_%s(%s)...\n",
                 Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
                 TypeName<Ty>::val());
        for (int k = 0; k < ng; ++k)
        {
            // Gather input and output into (sub group, lane) layout.
            for (int j = 0; j < nw; ++j)
            {
                int i = m[2 * j + 1] * ns + m[2 * j];
                mx[i] = x[j];
                my[i] = y[j];
            }
            for (int j = 0; j < nj; ++j)
            {
                int ii = j * ns;
                int n = ii + ns > nw ? nw - ii : ns;
                // Host-side reference reduction over lanes [0, n).
                Ty tr = mx[ii];
                for (int i = 1; i < n; ++i)
                {
                    Ty v = mx[ii + i];
                    if (Which == 0)
                        tr += v; // add
                    else if (Which == 1)
                        tr = tr > v ? tr : v; // max
                    else
                        tr = tr > v ? v : tr; // min
                }
                // Every lane must carry the full reduction result.
                for (int i = 0; i < n; ++i)
                {
                    if (my[ii + i] != tr)
                    {
                        log_error("ERROR: sub_group_reduce_%s(%s) mismatch for "
                                  "local id %d in sub group %d in group %d\n",
                                  Which == 0 ? "add"
                                             : (Which == 1 ? "max" : "min"),
                                  TypeName<Ty>::val(), i, j, k);
                        return -1;
                    }
                }
            }
            x += nw;
            y += nw;
            m += 2 * nw;
        }
        return 0;
    }
};
// Scan Inclusive functions.
// Which selects the operation: 0 = add, 1 = max, 2 = min.
template <typename Ty, int Which> struct SCIN
{
    // Fill the input buffer for sub_group_scan_inclusive_{add,max,min}.
    // Each sub group gets the deterministic ramp 0,1,2,... so the host-side
    // reference scan stays exact for every tested element type.
    //   x  - output: data laid out in device (work group) order
    //   t  - scratch: data indexed by (sub group id, lane)
    //   m  - per work item (local id, sub group id) map read from the device
    //   ns - sub group size; nw - work group size; ng - number of groups
    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
    {
        int i, ii, j, k, n;
        int nj = (nw + ns - 1) / ns;
        for (k = 0; k < ng; ++k)
        {
            for (j = 0; j < nj; ++j)
            {
                ii = j * ns;
                // The last sub group of a work group may be partial.
                n = ii + ns > nw ? nw - ii : ns;
                for (i = 0; i < n; ++i) t[ii + i] = (Ty)i;
            }
            // Now map into work group using map from device
            for (j = 0; j < nw; ++j)
            {
                i = m[2 * j + 1] * ns + m[2 * j];
                x[j] = t[i];
            }
            x += nw;
            m += 2 * nw;
        }
    }
    // Verify device output y against a host-side inclusive scan: lane i must
    // hold the operation applied over lanes [0, i] of its sub group.
    // Returns 0 on success, -1 on the first mismatch.
    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw,
                   int ng)
    {
        int ii, i, j, k, n;
        int nj = (nw + ns - 1) / ns;
        Ty tr, rr;
        log_info(" sub_group_scan_inclusive_%s(%s)...\n",
                 Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
                 TypeName<Ty>::val());
        for (k = 0; k < ng; ++k)
        {
            // Gather input and output into (sub group, lane) layout.
            for (j = 0; j < nw; ++j)
            {
                i = m[2 * j + 1] * ns + m[2 * j];
                mx[i] = x[j];
                my[i] = y[j];
            }
            for (j = 0; j < nj; ++j)
            {
                ii = j * ns;
                n = ii + ns > nw ? nw - ii : ns;
                // Running inclusive scan: tr accumulates lanes [0, i].
                for (i = 0; i < n; ++i)
                {
                    if (Which == 0)
                    {
                        tr = i == 0 ? mx[ii] : tr + mx[ii + i];
                    }
                    else if (Which == 1)
                    {
                        tr = i == 0 ? mx[ii]
                                    : (tr > mx[ii + i] ? tr : mx[ii + i]);
                    }
                    else
                    {
                        tr = i == 0 ? mx[ii]
                                    : (tr > mx[ii + i] ? mx[ii + i] : tr);
                    }
                    rr = my[ii + i];
                    if (rr != tr)
                    {
                        log_error(
                            "ERROR: sub_group_scan_inclusive_%s(%s) mismatch "
                            "for local id %d in sub group %d in group %d\n",
                            Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
                            TypeName<Ty>::val(), i, j, k);
                        return -1;
                    }
                }
            }
            x += nw;
            y += nw;
            m += 2 * nw;
        }
        return 0;
    }
};
// Scan Exclusive functions.
// Which selects the operation: 0 = add, 1 = max, 2 = min.
template <typename Ty, int Which> struct SCEX
{
    // Fill the input buffer with random values in [1, ns] per sub group,
    // then scatter into device (work group) order via the id map m.
    //   x  - output: data laid out in device (work group) order
    //   t  - scratch: data indexed by (sub group id, lane)
    //   m  - per work item (local id, sub group id) map read from the device
    //   ns - sub group size; nw - work group size; ng - number of groups
    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
    {
        int i, ii, j, k, n;
        int nj = (nw + ns - 1) / ns;
        ii = 0;
        for (k = 0; k < ng; ++k)
        {
            for (j = 0; j < nj; ++j)
            {
                ii = j * ns;
                // The last sub group of a work group may be partial.
                n = ii + ns > nw ? nw - ii : ns;
                for (i = 0; i < n; ++i)
                    t[ii + i] = (Ty)(
                        (int)(genrand_int32(gMTdata) & 0x7fffffff) % ns + 1);
            }
            // Now map into work group using map from device
            for (j = 0; j < nw; ++j)
            {
                i = m[2 * j + 1] * ns + m[2 * j];
                x[j] = t[i];
            }
            x += nw;
            m += 2 * nw;
        }
    }
    // Verify device output y against a host-side exclusive scan: lane i must
    // hold the operation applied over lanes [0, i), with lane 0 receiving
    // the operation's identity element. Returns 0 on success, -1 on mismatch.
    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw,
                   int ng)
    {
        int ii, i, j, k, n;
        int nj = (nw + ns - 1) / ns;
        // tr is the running exclusive-scan value; trt carries the previous
        // lane's input (written each iteration, read on the next one, and
        // deliberately unused when i == 0).
        Ty tr, trt, rr;
        log_info(" sub_group_scan_exclusive_%s(%s)...\n",
                 Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
                 TypeName<Ty>::val());
        for (k = 0; k < ng; ++k)
        {
            // Gather input and output into (sub group, lane) layout.
            for (j = 0; j < nw; ++j)
            {
                i = m[2 * j + 1] * ns + m[2 * j];
                mx[i] = x[j];
                my[i] = y[j];
            }
            for (j = 0; j < nj; ++j)
            {
                ii = j * ns;
                n = ii + ns > nw ? nw - ii : ns;
                // Check result
                for (i = 0; i < n; ++i)
                {
                    // Lane 0 expects TypeIdentity (0 for add, type min/max
                    // for max/min); later lanes fold in the previous input.
                    if (Which == 0)
                    {
                        tr = i == 0 ? TypeIdentity<Ty, Which>::val() : tr + trt;
                    }
                    else if (Which == 1)
                    {
                        tr = i == 0 ? TypeIdentity<Ty, Which>::val()
                                    : (trt > tr ? trt : tr);
                    }
                    else
                    {
                        tr = i == 0 ? TypeIdentity<Ty, Which>::val()
                                    : (trt > tr ? tr : trt);
                    }
                    trt = mx[ii + i];
                    rr = my[ii + i];
                    if (rr != tr)
                    {
                        log_error(
                            "ERROR: sub_group_scan_exclusive_%s(%s) mismatch "
                            "for local id %d in sub group %d in group %d\n",
                            Which == 0 ? "add" : (Which == 1 ? "max" : "min"),
                            TypeName<Ty>::val(), i, j, k);
                        return -1;
                    }
                }
            }
            x += nw;
            y += nw;
            m += 2 * nw;
        }
        return 0;
    }
};
// Broadcast functions.
template <typename Ty> struct BC
{
    // Generate broadcast inputs. Every element of a sub group encodes the
    // same randomly chosen broadcast lane l in its two low decimal digits
    // (value % 100 == l), which the kernel extracts as the source lane.
    static void gen(Ty *x, Ty *t, cl_int *m, int ns, int nw, int ng)
    {
        int nj = (nw + ns - 1) / ns;
        // At most 100 lanes are addressable through the 2-digit encoding.
        int d = ns > 100 ? 100 : ns;
        for (int k = 0; k < ng; ++k)
        {
            for (int j = 0; j < nj; ++j)
            {
                int ii = j * ns;
                // The last sub group of a work group may be partial.
                int n = ii + ns > nw ? nw - ii : ns;
                // Broadcast lane must be valid for this sub group's size.
                int l = (int)(genrand_int32(gMTdata) & 0x7fffffff)
                    % (d > n ? n : d);
                for (int i = 0; i < n; ++i)
                {
                    int payload =
                        (int)(genrand_int32(gMTdata) & 0x7fffffff) % 100;
                    t[ii + i] = (Ty)(payload * 100 + l);
                }
            }
            // Scatter into work group order using the map from the device.
            for (int j = 0; j < nw; ++j)
            {
                x[j] = t[m[2 * j + 1] * ns + m[2 * j]];
            }
            x += nw;
            m += 2 * nw;
        }
    }
    // Verify device output y: every lane of a sub group must observe the
    // value held by the encoded broadcast lane. Returns 0 on success,
    // -1 on the first mismatch.
    static int chk(Ty *x, Ty *y, Ty *mx, Ty *my, cl_int *m, int ns, int nw,
                   int ng)
    {
        int nj = (nw + ns - 1) / ns;
        log_info(" sub_group_broadcast(%s)...\n", TypeName<Ty>::val());
        for (int k = 0; k < ng; ++k)
        {
            // Gather input and output into (sub group, lane) layout.
            for (int j = 0; j < nw; ++j)
            {
                int i = m[2 * j + 1] * ns + m[2 * j];
                mx[i] = x[j];
                my[i] = y[j];
            }
            for (int j = 0; j < nj; ++j)
            {
                int ii = j * ns;
                int n = ii + ns > nw ? nw - ii : ns;
                // Decode the broadcast lane and look up the expected value.
                int l = (int)mx[ii] % 100;
                Ty tr = mx[ii + l];
                for (int i = 0; i < n; ++i)
                {
                    if (my[ii + i] != tr)
                    {
                        log_error("ERROR: sub_group_broadcast(%s) mismatch for "
                                  "local id %d in sub group %d in group %d\n",
                                  TypeName<Ty>::val(), i, j, k);
                        return -1;
                    }
                }
            }
            x += nw;
            y += nw;
            m += 2 * nw;
        }
        return 0;
    }
};
// Launch dimensions passed as the last two template arguments of test<>.
// G presumably is the global/group count and L the work group size used by
// the test<> harness — TODO confirm against the test<> template definition.
#define G 2000
#define L 200
// Helper that runs the broadcast/reduce/scan kernel suite for one element
// type. Captures the device/context/queue once so run<T>() can be invoked
// repeatedly for each tested type.
struct run_for_type
{
    // Members are set via the initializer list (idiomatic; avoids
    // default-construct-then-assign in the constructor body).
    run_for_type(cl_device_id device, cl_context context,
                 cl_command_queue queue, int num_elements,
                 bool useCoreSubgroups)
        : device_(device), context_(context), queue_(queue),
          num_elements_(num_elements), useCoreSubgroups_(useCoreSubgroups)
    {}

    // Runs every sub group collective test for element type T.
    // Returns 0 on success; individual error codes are OR-combined, so any
    // non-zero result means at least one test failed.
    template <typename T> cl_int run()
    {
        cl_int error;
        error = test<T, BC<T>, G, L>::run(device_, context_, queue_,
                                          num_elements_, "test_bcast",
                                          bcast_source, 0, useCoreSubgroups_);
        error |= test<T, RED<T, 0>, G, L>::run(
            device_, context_, queue_, num_elements_, "test_redadd",
            redadd_source, 0, useCoreSubgroups_);
        error |= test<T, RED<T, 1>, G, L>::run(
            device_, context_, queue_, num_elements_, "test_redmax",
            redmax_source, 0, useCoreSubgroups_);
        error |= test<T, RED<T, 2>, G, L>::run(
            device_, context_, queue_, num_elements_, "test_redmin",
            redmin_source, 0, useCoreSubgroups_);
        error |= test<T, SCIN<T, 0>, G, L>::run(
            device_, context_, queue_, num_elements_, "test_scinadd",
            scinadd_source, 0, useCoreSubgroups_);
        error |= test<T, SCIN<T, 1>, G, L>::run(
            device_, context_, queue_, num_elements_, "test_scinmax",
            scinmax_source, 0, useCoreSubgroups_);
        error |= test<T, SCIN<T, 2>, G, L>::run(
            device_, context_, queue_, num_elements_, "test_scinmin",
            scinmin_source, 0, useCoreSubgroups_);
        error |= test<T, SCEX<T, 0>, G, L>::run(
            device_, context_, queue_, num_elements_, "test_scexadd",
            scexadd_source, 0, useCoreSubgroups_);
        error |= test<T, SCEX<T, 1>, G, L>::run(
            device_, context_, queue_, num_elements_, "test_scexmax",
            scexmax_source, 0, useCoreSubgroups_);
        error |= test<T, SCEX<T, 2>, G, L>::run(
            device_, context_, queue_, num_elements_, "test_scexmin",
            scexmin_source, 0, useCoreSubgroups_);
        return error;
    }
private:
    cl_device_id device_;
    cl_context context_;
    cl_command_queue queue_;
    int num_elements_;
    bool useCoreSubgroups_;
};
// Entry point from main: runs the whole sub group function suite.
int test_work_group_functions(cl_device_id device, cl_context context,
                              cl_command_queue queue, int num_elements,
                              bool useCoreSubgroups)
{
    // any/all are exercised on int data only; the remaining collectives run
    // for each element type below through run_for_type.
    int error = test<int, AA<0>, G, L>::run(device, context, queue,
                                            num_elements, "test_any",
                                            any_source, 0, useCoreSubgroups);
    error |= test<int, AA<1>, G, L>::run(device, context, queue, num_elements,
                                         "test_all", all_source, 0,
                                         useCoreSubgroups);
    run_for_type rft(device, context, queue, num_elements, useCoreSubgroups);
    error |= rft.run<cl_uint>();
    error |= rft.run<cl_int>();
    error |= rft.run<cl_ulong>();
    error |= rft.run<cl_long>();
    error |= rft.run<float>();
    error |= rft.run<double>();
    // cl_half is intentionally not exercised here.
    // error |= rft.run<cl_half>();
    return error;
}
// Core entry point: runs the suite with core (non-extension) subgroups.
int test_work_group_functions_core(cl_device_id device, cl_context context,
                                   cl_command_queue queue, int num_elements)
{
    const bool useCoreSubgroups = true;
    return test_work_group_functions(device, context, queue, num_elements,
                                     useCoreSubgroups);
}
// Extension entry point: runs the suite through cl_khr_subgroups, skipping
// when the device does not expose the extension.
int test_work_group_functions_ext(cl_device_id device, cl_context context,
                                  cl_command_queue queue, int num_elements)
{
    if (!is_extension_available(device, "cl_khr_subgroups"))
    {
        log_info(
            "Device does not support 'cl_khr_subgroups'. Skipping the test.\n");
        return TEST_SKIPPED_ITSELF;
    }
    return test_work_group_functions(device, context, queue, num_elements,
                                     false);
}