diff --git a/test_conformance/subgroups/subhelpers.h b/test_conformance/subgroups/subhelpers.h index 30105a57..aa4abc96 100644 --- a/test_conformance/subgroups/subhelpers.h +++ b/test_conformance/subgroups/subhelpers.h @@ -34,12 +34,24 @@ extern MTdata gMTdata; typedef std::bitset<128> bs128; extern cl_half_rounding_mode g_rounding_mode; +static cl_uint4 bs128_to_cl_uint4(bs128 v) +{ + bs128 bs128_ffffffff = 0xffffffffU; + + cl_uint4 r; + r.s0 = ((v >> 0) & bs128_ffffffff).to_ulong(); + r.s1 = ((v >> 32) & bs128_ffffffff).to_ulong(); + r.s2 = ((v >> 64) & bs128_ffffffff).to_ulong(); + r.s3 = ((v >> 96) & bs128_ffffffff).to_ulong(); + + return r; +} + struct WorkGroupParams { - WorkGroupParams(size_t gws, size_t lws, - bool use_mask = false) + WorkGroupParams(size_t gws, size_t lws, int dm_arg = -1) : global_workgroup_size(gws), local_workgroup_size(lws), - use_masks(use_mask) + divergence_mask_arg(dm_arg) { subgroup_size = 0; work_items_mask = 0; @@ -54,7 +66,7 @@ struct WorkGroupParams int dynsc; bool use_core_subgroups; std::vector all_work_item_masks; - bool use_masks; + int divergence_mask_arg; void save_kernel_source(const std::string &source, std::string name = "") { if (name == "") @@ -84,7 +96,7 @@ private: std::map kernel_function_name; void load_masks() { - if (use_masks) + if (divergence_mask_arg != -1) { // 1 in string will be set 1, 0 will be set 0 bs128 mask_0xf0f0f0f0("11110000111100001111000011110000" @@ -1375,50 +1387,10 @@ static int run_kernel(cl_context context, cl_command_queue queue, // Driver for testing a single built in function template struct test { - static test_status mrun(cl_device_id device, cl_context context, - cl_command_queue queue, int num_elements, - const char *kname, const char *src, - WorkGroupParams test_params) - { - Fns::log_test(test_params, ""); - - test_status combined_error = TEST_SKIPPED_ITSELF; - for (auto &mask : test_params.all_work_item_masks) - { - test_params.work_items_mask = mask; - test_status error = do_run(device, context, queue, num_elements, - kname, src, test_params); - - if (error == TEST_FAIL - || (error == TEST_PASS && combined_error != TEST_FAIL)) - combined_error = error; - } - - if (combined_error == TEST_PASS) - { - Fns::log_test(test_params, " passed"); - } - return combined_error; - }; - static int run(cl_device_id device, cl_context context, - cl_command_queue queue, int num_elements, const char *kname, - const char *src, WorkGroupParams test_params) - { - Fns::log_test(test_params, ""); - - int error = do_run(device, context, queue, num_elements, kname, src, - test_params); - - if (error == TEST_PASS) - { - Fns::log_test(test_params, " passed"); - } - return error; - }; - static test_status do_run(cl_device_id device, cl_context context, - cl_command_queue queue, int num_elements, - const char *kname, const char *src, - WorkGroupParams test_params) + static test_status run(cl_device_id device, cl_context context, + cl_command_queue queue, int num_elements, + const char *kname, const char *src, + WorkGroupParams test_params) { size_t tmp; cl_int error; @@ -1436,25 +1408,8 @@ template struct test std::vector mapout; mapout.resize(local); std::stringstream kernel_sstr; - if (test_params.use_masks) - { - // Prapare uint4 type to store bitmask on kernel OpenCL C side - // To keep order the first characet in string is the lowest bit - // there was a need to give such offset to bitset constructor - // (first highest offset = 96) - std::bitset<32> bits_1_32(test_params.work_items_mask.to_string(), - 96, 32); - std::bitset<32> bits_33_64(test_params.work_items_mask.to_string(), - 64, 32); - std::bitset<32> bits_65_96(test_params.work_items_mask.to_string(), - 32, 32); - std::bitset<32> bits_97_128(test_params.work_items_mask.to_string(), - 0, 32); - kernel_sstr << "global uint4 work_item_mask_vector = (uint4)(0b" - << bits_1_32 << ",0b" << bits_33_64 << ",0b" - << bits_65_96 << ",0b" << bits_97_128 << ");\n"; - } + Fns::log_test(test_params, ""); kernel_sstr << "#define NR_OF_ACTIVE_WORK_ITEMS "; kernel_sstr << NR_OF_ACTIVE_WORK_ITEMS << "\n"; @@ -1563,6 +1518,18 @@ template struct test idata.resize(input_array_size); odata.resize(output_array_size); + if (test_params.divergence_mask_arg != -1) + { + cl_uint4 mask_vector; + mask_vector.x = 0xffffffffU; + mask_vector.y = 0xffffffffU; + mask_vector.z = 0xffffffffU; + mask_vector.w = 0xffffffffU; + error = clSetKernelArg(kernel, test_params.divergence_mask_arg, + sizeof(cl_uint4), &mask_vector); + test_error_fail(error, "Unable to set divergence mask argument"); + } + // Run the kernel once on zeroes to get the map memset(idata.data(), 0, input_array_size * sizeof(Ty)); error = run_kernel(context, queue, kernel, global, local, idata.data(), @@ -1572,25 +1539,65 @@ template struct test test_error_fail(error, "Running kernel first time failed"); // Generate the desired input for the kernel - test_params.subgroup_size = subgroup_size; Fns::gen(idata.data(), mapin.data(), sgmap.data(), test_params); - error = run_kernel(context, queue, kernel, global, local, idata.data(), + + test_status combined_status; + + if (test_params.divergence_mask_arg != -1) + { + combined_status = TEST_SKIPPED_ITSELF; + + for (auto &mask : test_params.all_work_item_masks) + { + test_params.work_items_mask = mask; + cl_uint4 mask_vector = bs128_to_cl_uint4(mask); + clSetKernelArg(kernel, test_params.divergence_mask_arg, + sizeof(cl_uint4), &mask_vector); + error = run_kernel(context, queue, kernel, global, local, + idata.data(), input_array_size * sizeof(Ty), + sgmap.data(), global * sizeof(cl_int4), + odata.data(), output_array_size * sizeof(Ty), + TSIZE * sizeof(Ty)); + test_error_fail(error, "Running kernel second time failed"); + + // Check the result + test_status status = + Fns::chk(idata.data(), odata.data(), mapin.data(), + mapout.data(), sgmap.data(), test_params); + + if (status == TEST_FAIL + || (status == TEST_PASS && combined_status != TEST_FAIL)) + combined_status = status; + + if (status == TEST_FAIL) break; + } + } + else + { + error = + run_kernel(context, queue, kernel, global, local, idata.data(), input_array_size * sizeof(Ty), sgmap.data(), global * sizeof(cl_int4), odata.data(), output_array_size * sizeof(Ty), TSIZE * sizeof(Ty)); - test_error_fail(error, "Running kernel second time failed"); + test_error_fail(error, "Running kernel second time failed"); - // Check the result - test_status status = Fns::chk(idata.data(), odata.data(), mapin.data(), - mapout.data(), sgmap.data(), test_params); + // Check the result + combined_status = + Fns::chk(idata.data(), odata.data(), mapin.data(), + mapout.data(), sgmap.data(), test_params); + } // Detailed failure and skip messages should be logged by Fns::gen // and Fns::chk. - if (status == TEST_FAIL) + if (combined_status == TEST_PASS) + { + Fns::log_test(test_params, " passed"); + } + else if (combined_status == TEST_FAIL) { test_fail("Data verification failed\n"); } - return status; + return combined_status; } }; @@ -1643,18 +1650,9 @@ struct RunTestForType std::regex_replace(test_params_.get_kernel_source(function_name), std::regex("\\%s"), function_name); std::string kernel_name = "test_" + function_name; - if (test_params_.all_work_item_masks.size() > 0) - { - error = test::mrun(device_, context_, queue_, num_elements_, - kernel_name.c_str(), source.c_str(), - test_params_); - } - else - { - error = test::run(device_, context_, queue_, num_elements_, - kernel_name.c_str(), source.c_str(), - test_params_); - } + error = + test::run(device_, context_, queue_, num_elements_, + kernel_name.c_str(), source.c_str(), test_params_); // If we return TEST_SKIPPED_ITSELF here, then an entire suite may be // reported as having been skipped even if some tests within it diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp index 5ab45222..02fc507b 100644 --- a/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp +++ b/test_conformance/subgroups/test_subgroup_non_uniform_arithmetic.cpp @@ -21,7 +21,7 @@ namespace { std::string sub_group_non_uniform_arithmetic_source = R"( - __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) { + __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out, uint4 work_item_mask_vector) { int gid = get_global_id(0); XY(xy,gid); uint subgroup_local_id = get_sub_group_local_id(); @@ -32,9 +32,9 @@ std::string sub_group_non_uniform_arithmetic_source = R"( } else if(subgroup_local_id < 64) { work_item_mask = work_item_mask_vector.y; } else if(subgroup_local_id < 96) { - work_item_mask = work_item_mask_vector.w; - } else if(subgroup_local_id < 128) { work_item_mask = work_item_mask_vector.z; + } else if(subgroup_local_id < 128) { + work_item_mask = work_item_mask_vector.w; } if (elect_work_item & work_item_mask){ out[gid] = %s(in[gid]); @@ -136,7 +136,7 @@ int test_subgroup_functions_non_uniform_arithmetic(cl_device_id device, constexpr size_t global_work_size = 2000; constexpr size_t local_work_size = 200; - WorkGroupParams test_params(global_work_size, local_work_size, true); + WorkGroupParams test_params(global_work_size, local_work_size, 3); test_params.save_kernel_source(sub_group_non_uniform_arithmetic_source); RunTestForType rft(device, context, queue, num_elements, test_params); diff --git a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp index 3f0985e2..3be1ba30 100644 --- a/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp +++ b/test_conformance/subgroups/test_subgroup_non_uniform_vote.cpp @@ -202,7 +202,7 @@ template struct VOTE }; std::string sub_group_elect_source = R"( - __kernel void test_sub_group_elect(const __global Type *in, __global int4 *xy, __global Type *out) { + __kernel void test_sub_group_elect(const __global Type *in, __global int4 *xy, __global Type *out, uint4 work_item_mask_vector) { int gid = get_global_id(0); XY(xy,gid); uint subgroup_local_id = get_sub_group_local_id(); @@ -213,9 +213,9 @@ std::string sub_group_elect_source = R"( } else if(subgroup_local_id < 64) { work_item_mask = work_item_mask_vector.y; } else if(subgroup_local_id < 96) { - work_item_mask = work_item_mask_vector.w; - } else if(subgroup_local_id < 128) { work_item_mask = work_item_mask_vector.z; + } else if(subgroup_local_id < 128) { + work_item_mask = work_item_mask_vector.w; } if (elect_work_item & work_item_mask){ out[gid] = sub_group_elect(); @@ -224,7 +224,7 @@ std::string sub_group_elect_source = R"( )"; std::string sub_group_non_uniform_any_all_all_equal_source = R"( - __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out) { + __kernel void test_%s(const __global Type *in, __global int4 *xy, __global Type *out, uint4 work_item_mask_vector) { int gid = get_global_id(0); XY(xy,gid); uint subgroup_local_id = get_sub_group_local_id(); @@ -235,9 +235,9 @@ std::string sub_group_non_uniform_any_all_all_equal_source = R"( } else if(subgroup_local_id < 64) { work_item_mask = work_item_mask_vector.y; } else if(subgroup_local_id < 96) { - work_item_mask = work_item_mask_vector.w; - } else if(subgroup_local_id < 128) { work_item_mask = work_item_mask_vector.z; + } else if(subgroup_local_id < 128) { + work_item_mask = work_item_mask_vector.w; } if (elect_work_item & work_item_mask){ out[gid] = %s(in[gid]); @@ -267,7 +267,7 @@ int test_subgroup_functions_non_uniform_vote(cl_device_id device, constexpr size_t global_work_size = 170; constexpr size_t local_work_size = 64; - WorkGroupParams test_params(global_work_size, local_work_size, true); + WorkGroupParams test_params(global_work_size, local_work_size, 3); test_params.save_kernel_source( sub_group_non_uniform_any_all_all_equal_source); test_params.save_kernel_source(sub_group_elect_source, "sub_group_elect");