Refactor divergence mask handling in subgroup tests (#1379)

This changes compilation of subgroup test kernels so that a separate
compilation is no longer performed for each divergence mask value.

The divergence mask is now passed as a kernel argument.

This also fixes all subgroup_functions_non_uniform_arithmetic testing
and the sub_group_elect and sub_group_any/all_equal subtests of the
subgroup_functions_non_uniform_vote test to use the correct order of
vector components for GPUs with a subgroup size greater than 64.

The conversion of divergence mask bitsets to uint4 vectors has been
corrected to match code comments in WorkGroupParams::load_masks()
in test_conformance/subgroups/subhelpers.h.

Signed-off-by: Stuart Brady <stuart.brady@arm.com>
This commit is contained in:
Stuart Brady
2022-01-19 14:17:54 +00:00
committed by GitHub
parent 06415f8b79
commit 656886030b
3 changed files with 96 additions and 98 deletions

View File

@@ -34,12 +34,24 @@ extern MTdata gMTdata;
typedef std::bitset<128> bs128; typedef std::bitset<128> bs128;
extern cl_half_rounding_mode g_rounding_mode; extern cl_half_rounding_mode g_rounding_mode;
static cl_uint4 bs128_to_cl_uint4(bs128 v)
{
bs128 bs128_ffffffff = 0xffffffffU;
cl_uint4 r;
r.s0 = ((v >> 0) & bs128_ffffffff).to_ulong();
r.s1 = ((v >> 32) & bs128_ffffffff).to_ulong();
r.s2 = ((v >> 64) & bs128_ffffffff).to_ulong();
r.s3 = ((v >> 96) & bs128_ffffffff).to_ulong();
return r;
}
struct WorkGroupParams struct WorkGroupParams
{ {
WorkGroupParams(size_t gws, size_t lws, WorkGroupParams(size_t gws, size_t lws, int dm_arg = -1)
bool use_mask = false)
: global_workgroup_size(gws), local_workgroup_size(lws), : global_workgroup_size(gws), local_workgroup_size(lws),
use_masks(use_mask) divergence_mask_arg(dm_arg)
{ {
subgroup_size = 0; subgroup_size = 0;
work_items_mask = 0; work_items_mask = 0;
@@ -54,7 +66,7 @@ struct WorkGroupParams
int dynsc; int dynsc;
bool use_core_subgroups; bool use_core_subgroups;
std::vector<bs128> all_work_item_masks; std::vector<bs128> all_work_item_masks;
bool use_masks; int divergence_mask_arg;
void save_kernel_source(const std::string &source, std::string name = "") void save_kernel_source(const std::string &source, std::string name = "")
{ {
if (name == "") if (name == "")
@@ -84,7 +96,7 @@ private:
std::map<std::string, std::string> kernel_function_name; std::map<std::string, std::string> kernel_function_name;
void load_masks() void load_masks()
{ {
if (use_masks) if (divergence_mask_arg != -1)
{ {
// 1 in string will be set 1, 0 will be set 0 // 1 in string will be set 1, 0 will be set 0
bs128 mask_0xf0f0f0f0("11110000111100001111000011110000" bs128 mask_0xf0f0f0f0("11110000111100001111000011110000"
@@ -1375,47 +1387,7 @@ static int run_kernel(cl_context context, cl_command_queue queue,
// Driver for testing a single built in function // Driver for testing a single built in function
template <typename Ty, typename Fns, size_t TSIZE = 0> struct test template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
{ {
static test_status mrun(cl_device_id device, cl_context context, static test_status run(cl_device_id device, cl_context context,
cl_command_queue queue, int num_elements,
const char *kname, const char *src,
WorkGroupParams test_params)
{
Fns::log_test(test_params, "");
test_status combined_error = TEST_SKIPPED_ITSELF;
for (auto &mask : test_params.all_work_item_masks)
{
test_params.work_items_mask = mask;
test_status error = do_run(device, context, queue, num_elements,
kname, src, test_params);
if (error == TEST_FAIL
|| (error == TEST_PASS && combined_error != TEST_FAIL))
combined_error = error;
}
if (combined_error == TEST_PASS)
{
Fns::log_test(test_params, " passed");
}
return combined_error;
};
static int run(cl_device_id device, cl_context context,
cl_command_queue queue, int num_elements, const char *kname,
const char *src, WorkGroupParams test_params)
{
Fns::log_test(test_params, "");
int error = do_run(device, context, queue, num_elements, kname, src,
test_params);
if (error == TEST_PASS)
{
Fns::log_test(test_params, " passed");
}
return error;
};
static test_status do_run(cl_device_id device, cl_context context,
cl_command_queue queue, int num_elements, cl_command_queue queue, int num_elements,
const char *kname, const char *src, const char *kname, const char *src,
WorkGroupParams test_params) WorkGroupParams test_params)
@@ -1436,25 +1408,8 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
std::vector<Ty> mapout; std::vector<Ty> mapout;
mapout.resize(local); mapout.resize(local);
std::stringstream kernel_sstr; std::stringstream kernel_sstr;
if (test_params.use_masks)
{
// Prapare uint4 type to store bitmask on kernel OpenCL C side
// To keep order the first characet in string is the lowest bit
// there was a need to give such offset to bitset constructor
// (first highest offset = 96)
std::bitset<32> bits_1_32(test_params.work_items_mask.to_string(),
96, 32);
std::bitset<32> bits_33_64(test_params.work_items_mask.to_string(),
64, 32);
std::bitset<32> bits_65_96(test_params.work_items_mask.to_string(),
32, 32);
std::bitset<32> bits_97_128(test_params.work_items_mask.to_string(),
0, 32);
kernel_sstr << "global uint4 work_item_mask_vector = (uint4)(0b"
<< bits_1_32 << ",0b" << bits_33_64 << ",0b"
<< bits_65_96 << ",0b" << bits_97_128 << ");\n";
}
Fns::log_test(test_params, "");
kernel_sstr << "#define NR_OF_ACTIVE_WORK_ITEMS "; kernel_sstr << "#define NR_OF_ACTIVE_WORK_ITEMS ";
kernel_sstr << NR_OF_ACTIVE_WORK_ITEMS << "\n"; kernel_sstr << NR_OF_ACTIVE_WORK_ITEMS << "\n";
@@ -1563,6 +1518,18 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
idata.resize(input_array_size); idata.resize(input_array_size);
odata.resize(output_array_size); odata.resize(output_array_size);
if (test_params.divergence_mask_arg != -1)
{
cl_uint4 mask_vector;
mask_vector.x = 0xffffffffU;
mask_vector.y = 0xffffffffU;
mask_vector.z = 0xffffffffU;
mask_vector.w = 0xffffffffU;
error = clSetKernelArg(kernel, test_params.divergence_mask_arg,
sizeof(cl_uint4), &mask_vector);
test_error_fail(error, "Unable to set divergence mask argument");
}
// Run the kernel once on zeroes to get the map // Run the kernel once on zeroes to get the map
memset(idata.data(), 0, input_array_size * sizeof(Ty)); memset(idata.data(), 0, input_array_size * sizeof(Ty));
error = run_kernel(context, queue, kernel, global, local, idata.data(), error = run_kernel(context, queue, kernel, global, local, idata.data(),
@@ -1572,25 +1539,65 @@ template <typename Ty, typename Fns, size_t TSIZE = 0> struct test
test_error_fail(error, "Running kernel first time failed"); test_error_fail(error, "Running kernel first time failed");
// Generate the desired input for the kernel // Generate the desired input for the kernel
test_params.subgroup_size = subgroup_size; test_params.subgroup_size = subgroup_size;
Fns::gen(idata.data(), mapin.data(), sgmap.data(), test_params); Fns::gen(idata.data(), mapin.data(), sgmap.data(), test_params);
error = run_kernel(context, queue, kernel, global, local, idata.data(),
test_status combined_status;
if (test_params.divergence_mask_arg != -1)
{
combined_status = TEST_SKIPPED_ITSELF;
for (auto &mask : test_params.all_work_item_masks)
{
test_params.work_items_mask = mask;
cl_uint4 mask_vector = bs128_to_cl_uint4(mask);
clSetKernelArg(kernel, test_params.divergence_mask_arg,
sizeof(cl_uint4), &mask_vector);
error = run_kernel(context, queue, kernel, global, local,
idata.data(), input_array_size * sizeof(Ty),
sgmap.data(), global * sizeof(cl_int4),
odata.data(), output_array_size * sizeof(Ty),
TSIZE * sizeof(Ty));
test_error_fail(error, "Running kernel second time failed");
// Check the result
test_status status =
Fns::chk(idata.data(), odata.data(), mapin.data(),
mapout.data(), sgmap.data(), test_params);
if (status == TEST_FAIL
|| (status == TEST_PASS && combined_status != TEST_FAIL))
combined_status = status;
if (status == TEST_FAIL) break;
}
}
else
{
error =
run_kernel(context, queue, kernel, global, local, idata.data(),
input_array_size * sizeof(Ty), sgmap.data(), input_array_size * sizeof(Ty), sgmap.data(),
global * sizeof(cl_int4), odata.data(), global * sizeof(cl_int4), odata.data(),
output_array_size * sizeof(Ty), TSIZE * sizeof(Ty)); output_array_size * sizeof(Ty), TSIZE * sizeof(Ty));
test_error_fail(error, "Running kernel second time failed"); test_error_fail(error, "Running kernel second time failed");
// Check the result // Check the result
test_status status = Fns::chk(idata.data(), odata.data(), mapin.data(), combined_status =
Fns::chk(idata.data(), odata.data(), mapin.data(),
mapout.data(), sgmap.data(), test_params); mapout.data(), sgmap.data(), test_params);
}
// Detailed failure and skip messages should be logged by Fns::gen // Detailed failure and skip messages should be logged by Fns::gen
// and Fns::chk. // and Fns::chk.
if (status == TEST_FAIL) if (combined_status == TEST_PASS)
{
Fns::log_test(test_params, " passed");
}
else if (combined_status == TEST_FAIL)
{ {
test_fail("Data verification failed\n"); test_fail("Data verification failed\n");
} }
return status; return combined_status;
} }
}; };
@@ -1643,18 +1650,9 @@ struct RunTestForType
std::regex_replace(test_params_.get_kernel_source(function_name), std::regex_replace(test_params_.get_kernel_source(function_name),
std::regex("\\%s"), function_name); std::regex("\\%s"), function_name);
std::string kernel_name = "test_" + function_name; std::string kernel_name = "test_" + function_name;
if (test_params_.all_work_item_masks.size() > 0) error =
{ test<T, U>::run(device_, context_, queue_, num_elements_,
error = test<T, U>::mrun(device_, context_, queue_, num_elements_, kernel_name.c_str(), source.c_str(), test_params_);
kernel_name.c_str(), source.c_str(),
test_params_);
}
else
{
error = test<T, U>::run(device_, context_, queue_, num_elements_,
kernel_name.c_str(), source.c_str(),
test_params_);
}
// If we return TEST_SKIPPED_ITSELF here, then an entire suite may be // If we return TEST_SKIPPED_ITSELF here, then an entire suite may be
// reported as having been skipped even if some tests within it // reported as having been skipped even if some tests within it

View File

@@ -21,7 +21,7 @@
namespace { namespace {
std::string sub_group_non_uniform_arithmetic_source = R"( std::string sub_group_non_uniform_arithmetic_source = R"(
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) { __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out, uint4 work_item_mask_vector) {
int gid = get_global_id(0); int gid = get_global_id(0);
XY(xy,gid); XY(xy,gid);
uint subgroup_local_id = get_sub_group_local_id(); uint subgroup_local_id = get_sub_group_local_id();
@@ -32,9 +32,9 @@ std::string sub_group_non_uniform_arithmetic_source = R"(
} else if(subgroup_local_id < 64) { } else if(subgroup_local_id < 64) {
work_item_mask = work_item_mask_vector.y; work_item_mask = work_item_mask_vector.y;
} else if(subgroup_local_id < 96) { } else if(subgroup_local_id < 96) {
work_item_mask = work_item_mask_vector.w;
} else if(subgroup_local_id < 128) {
work_item_mask = work_item_mask_vector.z; work_item_mask = work_item_mask_vector.z;
} else if(subgroup_local_id < 128) {
work_item_mask = work_item_mask_vector.w;
} }
if (elect_work_item & work_item_mask){ if (elect_work_item & work_item_mask){
out[gid] = %s(in[gid]); out[gid] = %s(in[gid]);
@@ -136,7 +136,7 @@ int test_subgroup_functions_non_uniform_arithmetic(cl_device_id device,
constexpr size_t global_work_size = 2000; constexpr size_t global_work_size = 2000;
constexpr size_t local_work_size = 200; constexpr size_t local_work_size = 200;
WorkGroupParams test_params(global_work_size, local_work_size, true); WorkGroupParams test_params(global_work_size, local_work_size, 3);
test_params.save_kernel_source(sub_group_non_uniform_arithmetic_source); test_params.save_kernel_source(sub_group_non_uniform_arithmetic_source);
RunTestForType rft(device, context, queue, num_elements, test_params); RunTestForType rft(device, context, queue, num_elements, test_params);

View File

@@ -202,7 +202,7 @@ template <typename T, NonUniformVoteOp operation> struct VOTE
}; };
std::string sub_group_elect_source = R"( std::string sub_group_elect_source = R"(
__kernel void test_sub_group_elect(const __global Type *in, __global int4 *xy, __global Type *out) { __kernel void test_sub_group_elect(const __global Type *in, __global int4 *xy, __global Type *out, uint4 work_item_mask_vector) {
int gid = get_global_id(0); int gid = get_global_id(0);
XY(xy,gid); XY(xy,gid);
uint subgroup_local_id = get_sub_group_local_id(); uint subgroup_local_id = get_sub_group_local_id();
@@ -213,9 +213,9 @@ std::string sub_group_elect_source = R"(
} else if(subgroup_local_id < 64) { } else if(subgroup_local_id < 64) {
work_item_mask = work_item_mask_vector.y; work_item_mask = work_item_mask_vector.y;
} else if(subgroup_local_id < 96) { } else if(subgroup_local_id < 96) {
work_item_mask = work_item_mask_vector.w;
} else if(subgroup_local_id < 128) {
work_item_mask = work_item_mask_vector.z; work_item_mask = work_item_mask_vector.z;
} else if(subgroup_local_id < 128) {
work_item_mask = work_item_mask_vector.w;
} }
if (elect_work_item & work_item_mask){ if (elect_work_item & work_item_mask){
out[gid] = sub_group_elect(); out[gid] = sub_group_elect();
@@ -224,7 +224,7 @@ std::string sub_group_elect_source = R"(
)"; )";
std::string sub_group_non_uniform_any_all_all_equal_source = R"( std::string sub_group_non_uniform_any_all_all_equal_source = R"(
__kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) { __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out, uint4 work_item_mask_vector) {
int gid = get_global_id(0); int gid = get_global_id(0);
XY(xy,gid); XY(xy,gid);
uint subgroup_local_id = get_sub_group_local_id(); uint subgroup_local_id = get_sub_group_local_id();
@@ -235,9 +235,9 @@ std::string sub_group_non_uniform_any_all_all_equal_source = R"(
} else if(subgroup_local_id < 64) { } else if(subgroup_local_id < 64) {
work_item_mask = work_item_mask_vector.y; work_item_mask = work_item_mask_vector.y;
} else if(subgroup_local_id < 96) { } else if(subgroup_local_id < 96) {
work_item_mask = work_item_mask_vector.w;
} else if(subgroup_local_id < 128) {
work_item_mask = work_item_mask_vector.z; work_item_mask = work_item_mask_vector.z;
} else if(subgroup_local_id < 128) {
work_item_mask = work_item_mask_vector.w;
} }
if (elect_work_item & work_item_mask){ if (elect_work_item & work_item_mask){
out[gid] = %s(in[gid]); out[gid] = %s(in[gid]);
@@ -267,7 +267,7 @@ int test_subgroup_functions_non_uniform_vote(cl_device_id device,
constexpr size_t global_work_size = 170; constexpr size_t global_work_size = 170;
constexpr size_t local_work_size = 64; constexpr size_t local_work_size = 64;
WorkGroupParams test_params(global_work_size, local_work_size, true); WorkGroupParams test_params(global_work_size, local_work_size, 3);
test_params.save_kernel_source( test_params.save_kernel_source(
sub_group_non_uniform_any_all_all_equal_source); sub_group_non_uniform_any_all_all_equal_source);
test_params.save_kernel_source(sub_group_elect_source, "sub_group_elect"); test_params.save_kernel_source(sub_group_elect_source, "sub_group_elect");